Batch upload part 17
Browse files- nl_tasks/exp100/run_ex02/ft/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex02/ft/special_tokens_map.json +24 -0
- nl_tasks/exp100/run_ex02/ft/tokenizer.json +0 -0
- nl_tasks/exp100/run_ex02/ft/tokenizer.model +3 -0
- nl_tasks/exp100/run_ex02/ft/tokenizer_config.json +43 -0
- nl_tasks/exp100/run_ex02/ft2/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex02/ft2/adapter_model.bin +3 -0
- nl_tasks/exp100/run_ex03/ft/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex03/ft/special_tokens_map.json +24 -0
- nl_tasks/exp100/run_ex03/ft/tokenizer.json +0 -0
- nl_tasks/exp100/run_ex03/ft/tokenizer.model +3 -0
- nl_tasks/exp100/run_ex03/ft/tokenizer_config.json +43 -0
- nl_tasks/exp100/run_ex03/ft2/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex03/ft2/adapter_model.bin +3 -0
- nl_tasks/exp100/run_ex03/trainer_state.json +260 -0
- nl_tasks/exp100/run_ex04/ft/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex04/ft/special_tokens_map.json +24 -0
- nl_tasks/exp100/run_ex04/ft/tokenizer.json +0 -0
- nl_tasks/exp100/run_ex04/ft/tokenizer.model +3 -0
- nl_tasks/exp100/run_ex04/ft/tokenizer_config.json +43 -0
- nl_tasks/exp100/run_ex04/ft2/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex04/ft2/adapter_model.bin +3 -0
- nl_tasks/exp100/run_ex04/trainer_state.json +260 -0
- nl_tasks/exp100/run_ex05/ft/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex05/ft/special_tokens_map.json +24 -0
- nl_tasks/exp100/run_ex05/ft/tokenizer.json +0 -0
- nl_tasks/exp100/run_ex05/ft/tokenizer.model +3 -0
- nl_tasks/exp100/run_ex05/ft/tokenizer_config.json +43 -0
- nl_tasks/exp100/run_ex05/ft2/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex05/ft2/adapter_model.bin +3 -0
- nl_tasks/exp100/run_ex05/trainer_state.json +365 -0
- nl_tasks/exp100/run_ex06/ft/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex06/ft/special_tokens_map.json +24 -0
- nl_tasks/exp100/run_ex06/ft/tokenizer.json +0 -0
- nl_tasks/exp100/run_ex06/ft/tokenizer.model +3 -0
- nl_tasks/exp100/run_ex06/ft/tokenizer_config.json +43 -0
- nl_tasks/exp100/run_ex06/ft2/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex06/ft2/adapter_model.bin +3 -0
- nl_tasks/exp100/run_ex06/trainer_state.json +183 -0
- nl_tasks/exp100/run_ex07/ft/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex07/ft/special_tokens_map.json +24 -0
- nl_tasks/exp100/run_ex07/ft/tokenizer.json +0 -0
- nl_tasks/exp100/run_ex07/ft/tokenizer.model +3 -0
- nl_tasks/exp100/run_ex07/ft/tokenizer_config.json +43 -0
- nl_tasks/exp100/run_ex07/ft2/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex07/ft2/adapter_model.bin +3 -0
- nl_tasks/exp100/run_ex07/trainer_state.json +260 -0
- nl_tasks/exp100/run_ex08/ft/special_tokens_map.json +24 -0
- nl_tasks/exp100/run_ex08/ft/tokenizer_config.json +43 -0
- nl_tasks/exp100/run_ex08/trainer_state.json +183 -0
nl_tasks/exp100/run_ex02/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex02/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exp100/run_ex02/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exp100/run_ex02/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exp100/run_ex02/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exp100/run_ex02/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex02/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:02608c9a9b5b0dcbf11d02401bcc8d4e1fd5f0bd460dee7c3adc0605897bd4e4
|
| 3 |
+
size 33602915
|
nl_tasks/exp100/run_ex03/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex03/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exp100/run_ex03/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exp100/run_ex03/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exp100/run_ex03/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exp100/run_ex03/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex03/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f6d4bcfd74a9f9c4a5ca4334bde6a2536a60517a9efc3aa1ff8e80f64d3159d0
|
| 3 |
+
size 33602915
|
nl_tasks/exp100/run_ex03/trainer_state.json
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 6250,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.064,
|
| 14 |
+
"grad_norm": 0.11334197223186493,
|
| 15 |
+
"learning_rate": 0.009988082511541485,
|
| 16 |
+
"loss": 0.5559,
|
| 17 |
+
"step": 200
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.128,
|
| 21 |
+
"grad_norm": 0.061305977404117584,
|
| 22 |
+
"learning_rate": 0.00992740539380525,
|
| 23 |
+
"loss": 0.2971,
|
| 24 |
+
"step": 400
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.192,
|
| 28 |
+
"grad_norm": 0.06917975842952728,
|
| 29 |
+
"learning_rate": 0.00981595390941903,
|
| 30 |
+
"loss": 0.2748,
|
| 31 |
+
"step": 600
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.256,
|
| 35 |
+
"grad_norm": 0.06461716443300247,
|
| 36 |
+
"learning_rate": 0.009654876508343738,
|
| 37 |
+
"loss": 0.2628,
|
| 38 |
+
"step": 800
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.32,
|
| 42 |
+
"grad_norm": 0.04892360046505928,
|
| 43 |
+
"learning_rate": 0.00944583300997063,
|
| 44 |
+
"loss": 0.2518,
|
| 45 |
+
"step": 1000
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.384,
|
| 49 |
+
"grad_norm": 0.03902921453118324,
|
| 50 |
+
"learning_rate": 0.00919097749954009,
|
| 51 |
+
"loss": 0.244,
|
| 52 |
+
"step": 1200
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.448,
|
| 56 |
+
"grad_norm": 0.042787957936525345,
|
| 57 |
+
"learning_rate": 0.008892936131406222,
|
| 58 |
+
"loss": 0.2388,
|
| 59 |
+
"step": 1400
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.512,
|
| 63 |
+
"grad_norm": 0.03384287655353546,
|
| 64 |
+
"learning_rate": 0.008554780067873127,
|
| 65 |
+
"loss": 0.2374,
|
| 66 |
+
"step": 1600
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.576,
|
| 70 |
+
"grad_norm": 0.03241891786456108,
|
| 71 |
+
"learning_rate": 0.008179993832454205,
|
| 72 |
+
"loss": 0.234,
|
| 73 |
+
"step": 1800
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.64,
|
| 77 |
+
"grad_norm": 0.028277236968278885,
|
| 78 |
+
"learning_rate": 0.007772439403657747,
|
| 79 |
+
"loss": 0.2328,
|
| 80 |
+
"step": 2000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.704,
|
| 84 |
+
"grad_norm": 0.03353444114327431,
|
| 85 |
+
"learning_rate": 0.007336316419293858,
|
| 86 |
+
"loss": 0.2262,
|
| 87 |
+
"step": 2200
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.768,
|
| 91 |
+
"grad_norm": 0.026780247688293457,
|
| 92 |
+
"learning_rate": 0.006876118901376725,
|
| 93 |
+
"loss": 0.2233,
|
| 94 |
+
"step": 2400
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.832,
|
| 98 |
+
"grad_norm": 0.027920261025428772,
|
| 99 |
+
"learning_rate": 0.006396588947549764,
|
| 100 |
+
"loss": 0.2206,
|
| 101 |
+
"step": 2600
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.896,
|
| 105 |
+
"grad_norm": 0.026285970583558083,
|
| 106 |
+
"learning_rate": 0.005902667866219604,
|
| 107 |
+
"loss": 0.2173,
|
| 108 |
+
"step": 2800
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.96,
|
| 112 |
+
"grad_norm": 0.027279643341898918,
|
| 113 |
+
"learning_rate": 0.005399445258926061,
|
| 114 |
+
"loss": 0.2156,
|
| 115 |
+
"step": 3000
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.024,
|
| 119 |
+
"grad_norm": 0.02697896584868431,
|
| 120 |
+
"learning_rate": 0.004892106574628014,
|
| 121 |
+
"loss": 0.2049,
|
| 122 |
+
"step": 3200
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.088,
|
| 126 |
+
"grad_norm": 0.027274351567029953,
|
| 127 |
+
"learning_rate": 0.004385879676331144,
|
| 128 |
+
"loss": 0.1914,
|
| 129 |
+
"step": 3400
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.152,
|
| 133 |
+
"grad_norm": 0.02400428242981434,
|
| 134 |
+
"learning_rate": 0.003885980970660839,
|
| 135 |
+
"loss": 0.1937,
|
| 136 |
+
"step": 3600
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.216,
|
| 140 |
+
"grad_norm": 0.02517438866198063,
|
| 141 |
+
"learning_rate": 0.0033975616554871714,
|
| 142 |
+
"loss": 0.1913,
|
| 143 |
+
"step": 3800
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.28,
|
| 147 |
+
"grad_norm": 0.022209836170077324,
|
| 148 |
+
"learning_rate": 0.0029256546394924123,
|
| 149 |
+
"loss": 0.1883,
|
| 150 |
+
"step": 4000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 1.3439999999999999,
|
| 154 |
+
"grad_norm": 0.021254699677228928,
|
| 155 |
+
"learning_rate": 0.0024751226806475142,
|
| 156 |
+
"loss": 0.1868,
|
| 157 |
+
"step": 4200
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 1.408,
|
| 161 |
+
"grad_norm": 0.021861741319298744,
|
| 162 |
+
"learning_rate": 0.002050608278003736,
|
| 163 |
+
"loss": 0.1848,
|
| 164 |
+
"step": 4400
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 1.472,
|
| 168 |
+
"grad_norm": 0.02483428828418255,
|
| 169 |
+
"learning_rate": 0.0016564858331386562,
|
| 170 |
+
"loss": 0.1848,
|
| 171 |
+
"step": 4600
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 1.536,
|
| 175 |
+
"grad_norm": 0.01872268132865429,
|
| 176 |
+
"learning_rate": 0.0012968165742081217,
|
| 177 |
+
"loss": 0.1775,
|
| 178 |
+
"step": 4800
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 1.6,
|
| 182 |
+
"grad_norm": 0.0213455967605114,
|
| 183 |
+
"learning_rate": 0.0009753067070884736,
|
| 184 |
+
"loss": 0.182,
|
| 185 |
+
"step": 5000
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 1.6640000000000001,
|
| 189 |
+
"grad_norm": 0.025262294337153435,
|
| 190 |
+
"learning_rate": 0.0006952692248399689,
|
| 191 |
+
"loss": 0.1778,
|
| 192 |
+
"step": 5200
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 1.728,
|
| 196 |
+
"grad_norm": 0.023666556924581528,
|
| 197 |
+
"learning_rate": 0.0004595897690250567,
|
| 198 |
+
"loss": 0.1763,
|
| 199 |
+
"step": 5400
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 1.792,
|
| 203 |
+
"grad_norm": 0.020242227241396904,
|
| 204 |
+
"learning_rate": 0.0002706968946630728,
|
| 205 |
+
"loss": 0.1744,
|
| 206 |
+
"step": 5600
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 1.8559999999999999,
|
| 210 |
+
"grad_norm": 0.020292259752750397,
|
| 211 |
+
"learning_rate": 0.00013053704522556165,
|
| 212 |
+
"loss": 0.1776,
|
| 213 |
+
"step": 5800
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 1.92,
|
| 217 |
+
"grad_norm": 0.024162383750081062,
|
| 218 |
+
"learning_rate": 4.055449554200896e-05,
|
| 219 |
+
"loss": 0.1721,
|
| 220 |
+
"step": 6000
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 1.984,
|
| 224 |
+
"grad_norm": 0.02264169603586197,
|
| 225 |
+
"learning_rate": 1.6764692939641446e-06,
|
| 226 |
+
"loss": 0.1728,
|
| 227 |
+
"step": 6200
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 2.0,
|
| 231 |
+
"step": 6250,
|
| 232 |
+
"total_flos": 4.0647058784256e+18,
|
| 233 |
+
"train_loss": 0.22118227462768555,
|
| 234 |
+
"train_runtime": 5603.7872,
|
| 235 |
+
"train_samples_per_second": 35.69,
|
| 236 |
+
"train_steps_per_second": 1.115
|
| 237 |
+
}
|
| 238 |
+
],
|
| 239 |
+
"logging_steps": 200,
|
| 240 |
+
"max_steps": 6250,
|
| 241 |
+
"num_input_tokens_seen": 0,
|
| 242 |
+
"num_train_epochs": 2,
|
| 243 |
+
"save_steps": 0,
|
| 244 |
+
"stateful_callbacks": {
|
| 245 |
+
"TrainerControl": {
|
| 246 |
+
"args": {
|
| 247 |
+
"should_epoch_stop": false,
|
| 248 |
+
"should_evaluate": false,
|
| 249 |
+
"should_log": false,
|
| 250 |
+
"should_save": true,
|
| 251 |
+
"should_training_stop": true
|
| 252 |
+
},
|
| 253 |
+
"attributes": {}
|
| 254 |
+
}
|
| 255 |
+
},
|
| 256 |
+
"total_flos": 4.0647058784256e+18,
|
| 257 |
+
"train_batch_size": 32,
|
| 258 |
+
"trial_name": null,
|
| 259 |
+
"trial_params": null
|
| 260 |
+
}
|
nl_tasks/exp100/run_ex04/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex04/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exp100/run_ex04/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exp100/run_ex04/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exp100/run_ex04/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exp100/run_ex04/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex04/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8f3ae0266c1a6651903cc003f29f14607c131c865fab0338ef1685771b547d6b
|
| 3 |
+
size 33602915
|
nl_tasks/exp100/run_ex04/trainer_state.json
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 6250,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.064,
|
| 14 |
+
"grad_norm": 0.04368972033262253,
|
| 15 |
+
"learning_rate": 0.049940412557707425,
|
| 16 |
+
"loss": 0.869,
|
| 17 |
+
"step": 200
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.128,
|
| 21 |
+
"grad_norm": 0.028173571452498436,
|
| 22 |
+
"learning_rate": 0.049637026969026256,
|
| 23 |
+
"loss": 0.2959,
|
| 24 |
+
"step": 400
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.192,
|
| 28 |
+
"grad_norm": 0.023793907836079597,
|
| 29 |
+
"learning_rate": 0.049079769547095156,
|
| 30 |
+
"loss": 0.2789,
|
| 31 |
+
"step": 600
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.256,
|
| 35 |
+
"grad_norm": 0.029163537546992302,
|
| 36 |
+
"learning_rate": 0.048274382541718695,
|
| 37 |
+
"loss": 0.2695,
|
| 38 |
+
"step": 800
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.32,
|
| 42 |
+
"grad_norm": 0.026763763278722763,
|
| 43 |
+
"learning_rate": 0.047229165049853146,
|
| 44 |
+
"loss": 0.2608,
|
| 45 |
+
"step": 1000
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.384,
|
| 49 |
+
"grad_norm": 0.020129157230257988,
|
| 50 |
+
"learning_rate": 0.04595488749770045,
|
| 51 |
+
"loss": 0.2529,
|
| 52 |
+
"step": 1200
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.448,
|
| 56 |
+
"grad_norm": 0.02349601686000824,
|
| 57 |
+
"learning_rate": 0.0444646806570311,
|
| 58 |
+
"loss": 0.2478,
|
| 59 |
+
"step": 1400
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.512,
|
| 63 |
+
"grad_norm": 0.017336582764983177,
|
| 64 |
+
"learning_rate": 0.042773900339365636,
|
| 65 |
+
"loss": 0.2475,
|
| 66 |
+
"step": 1600
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.576,
|
| 70 |
+
"grad_norm": 0.017148617655038834,
|
| 71 |
+
"learning_rate": 0.040899969162271024,
|
| 72 |
+
"loss": 0.2448,
|
| 73 |
+
"step": 1800
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.64,
|
| 77 |
+
"grad_norm": 0.014803516678512096,
|
| 78 |
+
"learning_rate": 0.03886219701828874,
|
| 79 |
+
"loss": 0.2445,
|
| 80 |
+
"step": 2000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.704,
|
| 84 |
+
"grad_norm": 0.012438619509339333,
|
| 85 |
+
"learning_rate": 0.03668158209646929,
|
| 86 |
+
"loss": 0.238,
|
| 87 |
+
"step": 2200
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.768,
|
| 91 |
+
"grad_norm": 0.014171008951961994,
|
| 92 |
+
"learning_rate": 0.034380594506883626,
|
| 93 |
+
"loss": 0.2348,
|
| 94 |
+
"step": 2400
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.832,
|
| 98 |
+
"grad_norm": 0.01251581683754921,
|
| 99 |
+
"learning_rate": 0.03198294473774882,
|
| 100 |
+
"loss": 0.2325,
|
| 101 |
+
"step": 2600
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.896,
|
| 105 |
+
"grad_norm": 0.011021828278899193,
|
| 106 |
+
"learning_rate": 0.029513339331098024,
|
| 107 |
+
"loss": 0.2287,
|
| 108 |
+
"step": 2800
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.96,
|
| 112 |
+
"grad_norm": 0.011575430631637573,
|
| 113 |
+
"learning_rate": 0.026997226294630305,
|
| 114 |
+
"loss": 0.2273,
|
| 115 |
+
"step": 3000
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.024,
|
| 119 |
+
"grad_norm": 0.011999037116765976,
|
| 120 |
+
"learning_rate": 0.024460532873140067,
|
| 121 |
+
"loss": 0.2178,
|
| 122 |
+
"step": 3200
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.088,
|
| 126 |
+
"grad_norm": 0.012413745746016502,
|
| 127 |
+
"learning_rate": 0.021929398381655724,
|
| 128 |
+
"loss": 0.2063,
|
| 129 |
+
"step": 3400
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.152,
|
| 133 |
+
"grad_norm": 0.012539232149720192,
|
| 134 |
+
"learning_rate": 0.019429904853304196,
|
| 135 |
+
"loss": 0.2096,
|
| 136 |
+
"step": 3600
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.216,
|
| 140 |
+
"grad_norm": 0.011288085952401161,
|
| 141 |
+
"learning_rate": 0.016987808277435856,
|
| 142 |
+
"loss": 0.2062,
|
| 143 |
+
"step": 3800
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.28,
|
| 147 |
+
"grad_norm": 0.01744219847023487,
|
| 148 |
+
"learning_rate": 0.014628273197462061,
|
| 149 |
+
"loss": 0.2029,
|
| 150 |
+
"step": 4000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 1.3439999999999999,
|
| 154 |
+
"grad_norm": 0.011675246991217136,
|
| 155 |
+
"learning_rate": 0.01237561340323757,
|
| 156 |
+
"loss": 0.2011,
|
| 157 |
+
"step": 4200
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 1.408,
|
| 161 |
+
"grad_norm": 0.01028984971344471,
|
| 162 |
+
"learning_rate": 0.010253041390018681,
|
| 163 |
+
"loss": 0.1981,
|
| 164 |
+
"step": 4400
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 1.472,
|
| 168 |
+
"grad_norm": 0.009951326064765453,
|
| 169 |
+
"learning_rate": 0.008282429165693281,
|
| 170 |
+
"loss": 0.1974,
|
| 171 |
+
"step": 4600
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 1.536,
|
| 175 |
+
"grad_norm": 0.00813743844628334,
|
| 176 |
+
"learning_rate": 0.006484082871040609,
|
| 177 |
+
"loss": 0.1889,
|
| 178 |
+
"step": 4800
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 1.6,
|
| 182 |
+
"grad_norm": 0.009917319752275944,
|
| 183 |
+
"learning_rate": 0.004876533535442368,
|
| 184 |
+
"loss": 0.1921,
|
| 185 |
+
"step": 5000
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 1.6640000000000001,
|
| 189 |
+
"grad_norm": 0.010274921543896198,
|
| 190 |
+
"learning_rate": 0.003476346124199845,
|
| 191 |
+
"loss": 0.1873,
|
| 192 |
+
"step": 5200
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 1.728,
|
| 196 |
+
"grad_norm": 0.011271242052316666,
|
| 197 |
+
"learning_rate": 0.0022979488451252833,
|
| 198 |
+
"loss": 0.1844,
|
| 199 |
+
"step": 5400
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 1.792,
|
| 203 |
+
"grad_norm": 0.009762358851730824,
|
| 204 |
+
"learning_rate": 0.001353484473315364,
|
| 205 |
+
"loss": 0.1814,
|
| 206 |
+
"step": 5600
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 1.8559999999999999,
|
| 210 |
+
"grad_norm": 0.00860436912626028,
|
| 211 |
+
"learning_rate": 0.0006526852261278083,
|
| 212 |
+
"loss": 0.1838,
|
| 213 |
+
"step": 5800
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 1.92,
|
| 217 |
+
"grad_norm": 0.010398217476904392,
|
| 218 |
+
"learning_rate": 0.00020277247771004482,
|
| 219 |
+
"loss": 0.1775,
|
| 220 |
+
"step": 6000
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 1.984,
|
| 224 |
+
"grad_norm": 0.010314074344933033,
|
| 225 |
+
"learning_rate": 8.382346469820723e-06,
|
| 226 |
+
"loss": 0.1783,
|
| 227 |
+
"step": 6200
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 2.0,
|
| 231 |
+
"step": 6250,
|
| 232 |
+
"total_flos": 4.0647058784256e+18,
|
| 233 |
+
"train_loss": 0.2409750535583496,
|
| 234 |
+
"train_runtime": 5596.8758,
|
| 235 |
+
"train_samples_per_second": 35.734,
|
| 236 |
+
"train_steps_per_second": 1.117
|
| 237 |
+
}
|
| 238 |
+
],
|
| 239 |
+
"logging_steps": 200,
|
| 240 |
+
"max_steps": 6250,
|
| 241 |
+
"num_input_tokens_seen": 0,
|
| 242 |
+
"num_train_epochs": 2,
|
| 243 |
+
"save_steps": 0,
|
| 244 |
+
"stateful_callbacks": {
|
| 245 |
+
"TrainerControl": {
|
| 246 |
+
"args": {
|
| 247 |
+
"should_epoch_stop": false,
|
| 248 |
+
"should_evaluate": false,
|
| 249 |
+
"should_log": false,
|
| 250 |
+
"should_save": true,
|
| 251 |
+
"should_training_stop": true
|
| 252 |
+
},
|
| 253 |
+
"attributes": {}
|
| 254 |
+
}
|
| 255 |
+
},
|
| 256 |
+
"total_flos": 4.0647058784256e+18,
|
| 257 |
+
"train_batch_size": 32,
|
| 258 |
+
"trial_name": null,
|
| 259 |
+
"trial_params": null
|
| 260 |
+
}
|
nl_tasks/exp100/run_ex05/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex05/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exp100/run_ex05/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exp100/run_ex05/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exp100/run_ex05/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exp100/run_ex05/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex05/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:92a6b1824b00e9dee64d31ee13cebccc364da375813e22cd7363a5dba7c0f92d
|
| 3 |
+
size 33602915
|
nl_tasks/exp100/run_ex05/trainer_state.json
ADDED
|
@@ -0,0 +1,365 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 9375,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.064,
|
| 14 |
+
"grad_norm": 0.1212976798415184,
|
| 15 |
+
"learning_rate": 0.00999684221114305,
|
| 16 |
+
"loss": 0.6038,
|
| 17 |
+
"step": 200
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.128,
|
| 21 |
+
"grad_norm": 0.065467968583107,
|
| 22 |
+
"learning_rate": 0.009973376564462873,
|
| 23 |
+
"loss": 0.2948,
|
| 24 |
+
"step": 400
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.192,
|
| 28 |
+
"grad_norm": 0.06629683822393417,
|
| 29 |
+
"learning_rate": 0.009927125570277145,
|
| 30 |
+
"loss": 0.2742,
|
| 31 |
+
"step": 600
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.256,
|
| 35 |
+
"grad_norm": 0.06881999224424362,
|
| 36 |
+
"learning_rate": 0.009858301125867589,
|
| 37 |
+
"loss": 0.2629,
|
| 38 |
+
"step": 800
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.32,
|
| 42 |
+
"grad_norm": 0.14311932027339935,
|
| 43 |
+
"learning_rate": 0.009767218547973728,
|
| 44 |
+
"loss": 0.253,
|
| 45 |
+
"step": 1000
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.384,
|
| 49 |
+
"grad_norm": 0.052844878286123276,
|
| 50 |
+
"learning_rate": 0.009654295128180494,
|
| 51 |
+
"loss": 0.2444,
|
| 52 |
+
"step": 1200
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.448,
|
| 56 |
+
"grad_norm": 0.045355528593063354,
|
| 57 |
+
"learning_rate": 0.009520048221111679,
|
| 58 |
+
"loss": 0.2394,
|
| 59 |
+
"step": 1400
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.512,
|
| 63 |
+
"grad_norm": 0.03619716316461563,
|
| 64 |
+
"learning_rate": 0.009365092874188177,
|
| 65 |
+
"loss": 0.2379,
|
| 66 |
+
"step": 1600
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.576,
|
| 70 |
+
"grad_norm": 0.03572649136185646,
|
| 71 |
+
"learning_rate": 0.00919013900981014,
|
| 72 |
+
"loss": 0.2351,
|
| 73 |
+
"step": 1800
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.64,
|
| 77 |
+
"grad_norm": 0.02887910045683384,
|
| 78 |
+
"learning_rate": 0.008995988172872798,
|
| 79 |
+
"loss": 0.2343,
|
| 80 |
+
"step": 2000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.704,
|
| 84 |
+
"grad_norm": 0.030390536412596703,
|
| 85 |
+
"learning_rate": 0.008783529858517077,
|
| 86 |
+
"loss": 0.2281,
|
| 87 |
+
"step": 2200
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.768,
|
| 91 |
+
"grad_norm": 0.02559584006667137,
|
| 92 |
+
"learning_rate": 0.008553737436939324,
|
| 93 |
+
"loss": 0.226,
|
| 94 |
+
"step": 2400
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.832,
|
| 98 |
+
"grad_norm": 0.028384791687130928,
|
| 99 |
+
"learning_rate": 0.008307663693930425,
|
| 100 |
+
"loss": 0.2239,
|
| 101 |
+
"step": 2600
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.896,
|
| 105 |
+
"grad_norm": 0.022031353786587715,
|
| 106 |
+
"learning_rate": 0.00804643600757522,
|
| 107 |
+
"loss": 0.2205,
|
| 108 |
+
"step": 2800
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.96,
|
| 112 |
+
"grad_norm": 0.02517508529126644,
|
| 113 |
+
"learning_rate": 0.007771251183209993,
|
| 114 |
+
"loss": 0.2194,
|
| 115 |
+
"step": 3000
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.024,
|
| 119 |
+
"grad_norm": 0.02236510068178177,
|
| 120 |
+
"learning_rate": 0.007483369970301455,
|
| 121 |
+
"loss": 0.2098,
|
| 122 |
+
"step": 3200
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.088,
|
| 126 |
+
"grad_norm": 0.02561621367931366,
|
| 127 |
+
"learning_rate": 0.0071841112863680005,
|
| 128 |
+
"loss": 0.1984,
|
| 129 |
+
"step": 3400
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.152,
|
| 133 |
+
"grad_norm": 0.024599742144346237,
|
| 134 |
+
"learning_rate": 0.006874846174406093,
|
| 135 |
+
"loss": 0.2025,
|
| 136 |
+
"step": 3600
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.216,
|
| 140 |
+
"grad_norm": 0.02521314099431038,
|
| 141 |
+
"learning_rate": 0.006556991521505633,
|
| 142 |
+
"loss": 0.2005,
|
| 143 |
+
"step": 3800
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.28,
|
| 147 |
+
"grad_norm": 0.019666949287056923,
|
| 148 |
+
"learning_rate": 0.006232003567432242,
|
| 149 |
+
"loss": 0.1988,
|
| 150 |
+
"step": 4000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 1.3439999999999999,
|
| 154 |
+
"grad_norm": 0.020948218181729317,
|
| 155 |
+
"learning_rate": 0.005901371232916675,
|
| 156 |
+
"loss": 0.1985,
|
| 157 |
+
"step": 4200
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 1.408,
|
| 161 |
+
"grad_norm": 0.022395219653844833,
|
| 162 |
+
"learning_rate": 0.005566609298217553,
|
| 163 |
+
"loss": 0.1972,
|
| 164 |
+
"step": 4400
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 1.472,
|
| 168 |
+
"grad_norm": 0.020159346982836723,
|
| 169 |
+
"learning_rate": 0.005229251463209568,
|
| 170 |
+
"loss": 0.198,
|
| 171 |
+
"step": 4600
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 1.536,
|
| 175 |
+
"grad_norm": 0.016137801110744476,
|
| 176 |
+
"learning_rate": 0.004890843320792184,
|
| 177 |
+
"loss": 0.1915,
|
| 178 |
+
"step": 4800
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 1.6,
|
| 182 |
+
"grad_norm": 0.017183274030685425,
|
| 183 |
+
"learning_rate": 0.004552935275810833,
|
| 184 |
+
"loss": 0.1971,
|
| 185 |
+
"step": 5000
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 1.6640000000000001,
|
| 189 |
+
"grad_norm": 0.01746981218457222,
|
| 190 |
+
"learning_rate": 0.004217075441932357,
|
| 191 |
+
"loss": 0.1929,
|
| 192 |
+
"step": 5200
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 1.728,
|
| 196 |
+
"grad_norm": 0.01835208386182785,
|
| 197 |
+
"learning_rate": 0.0038848025490174254,
|
| 198 |
+
"loss": 0.191,
|
| 199 |
+
"step": 5400
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 1.792,
|
| 203 |
+
"grad_norm": 0.017558401450514793,
|
| 204 |
+
"learning_rate": 0.0035576388934845005,
|
| 205 |
+
"loss": 0.1889,
|
| 206 |
+
"step": 5600
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 1.8559999999999999,
|
| 210 |
+
"grad_norm": 0.016855215653777122,
|
| 211 |
+
"learning_rate": 0.003237083363963042,
|
| 212 |
+
"loss": 0.1914,
|
| 213 |
+
"step": 5800
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 1.92,
|
| 217 |
+
"grad_norm": 0.01803995855152607,
|
| 218 |
+
"learning_rate": 0.0029246045741886696,
|
| 219 |
+
"loss": 0.185,
|
| 220 |
+
"step": 6000
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 1.984,
|
| 224 |
+
"grad_norm": 0.019782407209277153,
|
| 225 |
+
"learning_rate": 0.0026216341346016613,
|
| 226 |
+
"loss": 0.1845,
|
| 227 |
+
"step": 6200
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 2.048,
|
| 231 |
+
"grad_norm": 0.018053608015179634,
|
| 232 |
+
"learning_rate": 0.0023295600934747397,
|
| 233 |
+
"loss": 0.1674,
|
| 234 |
+
"step": 6400
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 2.112,
|
| 238 |
+
"grad_norm": 0.021936679258942604,
|
| 239 |
+
"learning_rate": 0.002049720577619374,
|
| 240 |
+
"loss": 0.1611,
|
| 241 |
+
"step": 6600
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"epoch": 2.176,
|
| 245 |
+
"grad_norm": 0.01545505877584219,
|
| 246 |
+
"learning_rate": 0.0017833976618054676,
|
| 247 |
+
"loss": 0.1611,
|
| 248 |
+
"step": 6800
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"epoch": 2.24,
|
| 252 |
+
"grad_norm": 0.018597135320305824,
|
| 253 |
+
"learning_rate": 0.001531811494981501,
|
| 254 |
+
"loss": 0.1582,
|
| 255 |
+
"step": 7000
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"epoch": 2.304,
|
| 259 |
+
"grad_norm": 0.01862194575369358,
|
| 260 |
+
"learning_rate": 0.001296114710205592,
|
| 261 |
+
"loss": 0.1617,
|
| 262 |
+
"step": 7200
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 2.368,
|
| 266 |
+
"grad_norm": 0.017578421160578728,
|
| 267 |
+
"learning_rate": 0.0010773871438982197,
|
| 268 |
+
"loss": 0.1546,
|
| 269 |
+
"step": 7400
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"epoch": 2.432,
|
| 273 |
+
"grad_norm": 0.01894843764603138,
|
| 274 |
+
"learning_rate": 0.0008766308886101404,
|
| 275 |
+
"loss": 0.1593,
|
| 276 |
+
"step": 7600
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"epoch": 2.496,
|
| 280 |
+
"grad_norm": 0.01759357750415802,
|
| 281 |
+
"learning_rate": 0.0006947657019710795,
|
| 282 |
+
"loss": 0.1569,
|
| 283 |
+
"step": 7800
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"epoch": 2.56,
|
| 287 |
+
"grad_norm": 0.0183447003364563,
|
| 288 |
+
"learning_rate": 0.0005326247928529187,
|
| 289 |
+
"loss": 0.1597,
|
| 290 |
+
"step": 8000
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 2.624,
|
| 294 |
+
"grad_norm": 0.01979999430477619,
|
| 295 |
+
"learning_rate": 0.000390951004052949,
|
| 296 |
+
"loss": 0.1559,
|
| 297 |
+
"step": 8200
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"epoch": 2.6879999999999997,
|
| 301 |
+
"grad_norm": 0.01742800511419773,
|
| 302 |
+
"learning_rate": 0.0002703934089860627,
|
| 303 |
+
"loss": 0.1531,
|
| 304 |
+
"step": 8400
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"epoch": 2.752,
|
| 308 |
+
"grad_norm": 0.018611254170536995,
|
| 309 |
+
"learning_rate": 0.00017150433797803011,
|
| 310 |
+
"loss": 0.1532,
|
| 311 |
+
"step": 8600
|
| 312 |
+
},
|
| 313 |
+
{
|
| 314 |
+
"epoch": 2.816,
|
| 315 |
+
"grad_norm": 0.019568437710404396,
|
| 316 |
+
"learning_rate": 9.473684778379676e-05,
|
| 317 |
+
"loss": 0.1549,
|
| 318 |
+
"step": 8800
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"epoch": 2.88,
|
| 322 |
+
"grad_norm": 0.01714991219341755,
|
| 323 |
+
"learning_rate": 4.04426459241064e-05,
|
| 324 |
+
"loss": 0.1533,
|
| 325 |
+
"step": 9000
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"epoch": 2.944,
|
| 329 |
+
"grad_norm": 0.02003113366663456,
|
| 330 |
+
"learning_rate": 8.87047935002272e-06,
|
| 331 |
+
"loss": 0.1541,
|
| 332 |
+
"step": 9200
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"epoch": 3.0,
|
| 336 |
+
"step": 9375,
|
| 337 |
+
"total_flos": 6.0970588176384e+18,
|
| 338 |
+
"train_loss": 0.2053676788330078,
|
| 339 |
+
"train_runtime": 8368.9804,
|
| 340 |
+
"train_samples_per_second": 35.847,
|
| 341 |
+
"train_steps_per_second": 1.12
|
| 342 |
+
}
|
| 343 |
+
],
|
| 344 |
+
"logging_steps": 200,
|
| 345 |
+
"max_steps": 9375,
|
| 346 |
+
"num_input_tokens_seen": 0,
|
| 347 |
+
"num_train_epochs": 3,
|
| 348 |
+
"save_steps": 0,
|
| 349 |
+
"stateful_callbacks": {
|
| 350 |
+
"TrainerControl": {
|
| 351 |
+
"args": {
|
| 352 |
+
"should_epoch_stop": false,
|
| 353 |
+
"should_evaluate": false,
|
| 354 |
+
"should_log": false,
|
| 355 |
+
"should_save": true,
|
| 356 |
+
"should_training_stop": true
|
| 357 |
+
},
|
| 358 |
+
"attributes": {}
|
| 359 |
+
}
|
| 360 |
+
},
|
| 361 |
+
"total_flos": 6.0970588176384e+18,
|
| 362 |
+
"train_batch_size": 32,
|
| 363 |
+
"trial_name": null,
|
| 364 |
+
"trial_params": null
|
| 365 |
+
}
|
nl_tasks/exp100/run_ex06/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex06/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exp100/run_ex06/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exp100/run_ex06/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exp100/run_ex06/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exp100/run_ex06/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex06/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b6eff2d5fa7579af4e75d06b790b1a0331b1bcb1e9d1a3259da889964fbf273
|
| 3 |
+
size 33602915
|
nl_tasks/exp100/run_ex06/trainer_state.json
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 4168,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.09596928982725528,
|
| 14 |
+
"grad_norm": 0.6366388201713562,
|
| 15 |
+
"learning_rate": 0.009964316835038782,
|
| 16 |
+
"loss": 5.3796,
|
| 17 |
+
"step": 200
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.19193857965451055,
|
| 21 |
+
"grad_norm": 0.05433971807360649,
|
| 22 |
+
"learning_rate": 0.00981641298054017,
|
| 23 |
+
"loss": 2.1242,
|
| 24 |
+
"step": 400
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.28790786948176583,
|
| 28 |
+
"grad_norm": 0.05430266633629799,
|
| 29 |
+
"learning_rate": 0.00955703204967729,
|
| 30 |
+
"loss": 0.2875,
|
| 31 |
+
"step": 600
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.3838771593090211,
|
| 35 |
+
"grad_norm": 0.042459528893232346,
|
| 36 |
+
"learning_rate": 0.009192177478607726,
|
| 37 |
+
"loss": 0.2663,
|
| 38 |
+
"step": 800
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.4798464491362764,
|
| 42 |
+
"grad_norm": 0.038593146950006485,
|
| 43 |
+
"learning_rate": 0.008730293917124864,
|
| 44 |
+
"loss": 0.251,
|
| 45 |
+
"step": 1000
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.5758157389635317,
|
| 49 |
+
"grad_norm": 0.04011659324169159,
|
| 50 |
+
"learning_rate": 0.008182071775138081,
|
| 51 |
+
"loss": 0.2465,
|
| 52 |
+
"step": 1200
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.6717850287907869,
|
| 56 |
+
"grad_norm": 0.0388704277575016,
|
| 57 |
+
"learning_rate": 0.007560199790476499,
|
| 58 |
+
"loss": 0.241,
|
| 59 |
+
"step": 1400
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.7677543186180422,
|
| 63 |
+
"grad_norm": 0.03452278673648834,
|
| 64 |
+
"learning_rate": 0.006879071344895999,
|
| 65 |
+
"loss": 0.2317,
|
| 66 |
+
"step": 1600
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.8637236084452975,
|
| 70 |
+
"grad_norm": 0.035173822194337845,
|
| 71 |
+
"learning_rate": 0.006154451325678603,
|
| 72 |
+
"loss": 0.2279,
|
| 73 |
+
"step": 1800
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.9596928982725528,
|
| 77 |
+
"grad_norm": 0.03411315754055977,
|
| 78 |
+
"learning_rate": 0.005403111243395296,
|
| 79 |
+
"loss": 0.2222,
|
| 80 |
+
"step": 2000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.055662188099808,
|
| 84 |
+
"grad_norm": 0.03330094739794731,
|
| 85 |
+
"learning_rate": 0.004642441051122492,
|
| 86 |
+
"loss": 0.2085,
|
| 87 |
+
"step": 2200
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 1.1516314779270633,
|
| 91 |
+
"grad_norm": 0.03454425185918808,
|
| 92 |
+
"learning_rate": 0.003890046649652885,
|
| 93 |
+
"loss": 0.202,
|
| 94 |
+
"step": 2400
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 1.2476007677543186,
|
| 98 |
+
"grad_norm": 0.031027931720018387,
|
| 99 |
+
"learning_rate": 0.0031633423945426832,
|
| 100 |
+
"loss": 0.1988,
|
| 101 |
+
"step": 2600
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 1.3435700575815739,
|
| 105 |
+
"grad_norm": 0.028650769963860512,
|
| 106 |
+
"learning_rate": 0.002479148036521003,
|
| 107 |
+
"loss": 0.1959,
|
| 108 |
+
"step": 2800
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 1.4395393474088292,
|
| 112 |
+
"grad_norm": 0.031158311292529106,
|
| 113 |
+
"learning_rate": 0.0018532994241758595,
|
| 114 |
+
"loss": 0.1933,
|
| 115 |
+
"step": 3000
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.5355086372360844,
|
| 119 |
+
"grad_norm": 0.031318724155426025,
|
| 120 |
+
"learning_rate": 0.0013002819792999314,
|
| 121 |
+
"loss": 0.1882,
|
| 122 |
+
"step": 3200
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.6314779270633397,
|
| 126 |
+
"grad_norm": 0.03243976831436157,
|
| 127 |
+
"learning_rate": 0.0008328954282003914,
|
| 128 |
+
"loss": 0.1887,
|
| 129 |
+
"step": 3400
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.727447216890595,
|
| 133 |
+
"grad_norm": 0.031175516545772552,
|
| 134 |
+
"learning_rate": 0.00046195754884998665,
|
| 135 |
+
"loss": 0.1856,
|
| 136 |
+
"step": 3600
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.8234165067178503,
|
| 140 |
+
"grad_norm": 0.028726134449243546,
|
| 141 |
+
"learning_rate": 0.00019605379072529185,
|
| 142 |
+
"loss": 0.1848,
|
| 143 |
+
"step": 3800
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.9193857965451055,
|
| 147 |
+
"grad_norm": 0.034339435398578644,
|
| 148 |
+
"learning_rate": 4.133856244321388e-05,
|
| 149 |
+
"loss": 0.1809,
|
| 150 |
+
"step": 4000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 2.0,
|
| 154 |
+
"step": 4168,
|
| 155 |
+
"total_flos": 4.0647058784256e+18,
|
| 156 |
+
"train_loss": 0.5545519496566275,
|
| 157 |
+
"train_runtime": 5547.6753,
|
| 158 |
+
"train_samples_per_second": 36.051,
|
| 159 |
+
"train_steps_per_second": 0.751
|
| 160 |
+
}
|
| 161 |
+
],
|
| 162 |
+
"logging_steps": 200,
|
| 163 |
+
"max_steps": 4168,
|
| 164 |
+
"num_input_tokens_seen": 0,
|
| 165 |
+
"num_train_epochs": 2,
|
| 166 |
+
"save_steps": 0,
|
| 167 |
+
"stateful_callbacks": {
|
| 168 |
+
"TrainerControl": {
|
| 169 |
+
"args": {
|
| 170 |
+
"should_epoch_stop": false,
|
| 171 |
+
"should_evaluate": false,
|
| 172 |
+
"should_log": false,
|
| 173 |
+
"should_save": true,
|
| 174 |
+
"should_training_stop": true
|
| 175 |
+
},
|
| 176 |
+
"attributes": {}
|
| 177 |
+
}
|
| 178 |
+
},
|
| 179 |
+
"total_flos": 4.0647058784256e+18,
|
| 180 |
+
"train_batch_size": 48,
|
| 181 |
+
"trial_name": null,
|
| 182 |
+
"trial_params": null
|
| 183 |
+
}
|
nl_tasks/exp100/run_ex07/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex07/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exp100/run_ex07/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exp100/run_ex07/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exp100/run_ex07/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exp100/run_ex07/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex07/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:463d7e91ab4fbd7f09a9bbaff1bf0cc91021c6cd2d4cb0c851d89a72c5e61e13
|
| 3 |
+
size 33602915
|
nl_tasks/exp100/run_ex07/trainer_state.json
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 6252,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.09596928982725528,
|
| 14 |
+
"grad_norm": 0.08396021276712418,
|
| 15 |
+
"learning_rate": 0.009988090209605933,
|
| 16 |
+
"loss": 6.13,
|
| 17 |
+
"step": 200
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.19193857965451055,
|
| 21 |
+
"grad_norm": 0.5583938956260681,
|
| 22 |
+
"learning_rate": 0.009927452190923135,
|
| 23 |
+
"loss": 4.905,
|
| 24 |
+
"step": 400
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.28790786948176583,
|
| 28 |
+
"grad_norm": 0.04631821811199188,
|
| 29 |
+
"learning_rate": 0.009816072106071381,
|
| 30 |
+
"loss": 0.8658,
|
| 31 |
+
"step": 600
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.3838771593090211,
|
| 35 |
+
"grad_norm": 0.04276253283023834,
|
| 36 |
+
"learning_rate": 0.00965509692825639,
|
| 37 |
+
"loss": 0.283,
|
| 38 |
+
"step": 800
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.4798464491362764,
|
| 42 |
+
"grad_norm": 0.03793822228908539,
|
| 43 |
+
"learning_rate": 0.009446184352531868,
|
| 44 |
+
"loss": 0.2622,
|
| 45 |
+
"step": 1000
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.5758157389635317,
|
| 49 |
+
"grad_norm": 0.03757239133119583,
|
| 50 |
+
"learning_rate": 0.009191485725137248,
|
| 51 |
+
"loss": 0.2549,
|
| 52 |
+
"step": 1200
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.6717850287907869,
|
| 56 |
+
"grad_norm": 0.03672794625163078,
|
| 57 |
+
"learning_rate": 0.00889362388929934,
|
| 58 |
+
"loss": 0.2488,
|
| 59 |
+
"step": 1400
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.7677543186180422,
|
| 63 |
+
"grad_norm": 0.03426536172628403,
|
| 64 |
+
"learning_rate": 0.008555666175638042,
|
| 65 |
+
"loss": 0.2386,
|
| 66 |
+
"step": 1600
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.8637236084452975,
|
| 70 |
+
"grad_norm": 0.0342838317155838,
|
| 71 |
+
"learning_rate": 0.008181092815316326,
|
| 72 |
+
"loss": 0.2349,
|
| 73 |
+
"step": 1800
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.9596928982725528,
|
| 77 |
+
"grad_norm": 0.03222282603383064,
|
| 78 |
+
"learning_rate": 0.007773761101210539,
|
| 79 |
+
"loss": 0.2294,
|
| 80 |
+
"step": 2000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.055662188099808,
|
| 84 |
+
"grad_norm": 0.033653076738119125,
|
| 85 |
+
"learning_rate": 0.0073378656661631705,
|
| 86 |
+
"loss": 0.2174,
|
| 87 |
+
"step": 2200
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 1.1516314779270633,
|
| 91 |
+
"grad_norm": 0.037820011377334595,
|
| 92 |
+
"learning_rate": 0.006877895287365947,
|
| 93 |
+
"loss": 0.2128,
|
| 94 |
+
"step": 2400
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 1.2476007677543186,
|
| 98 |
+
"grad_norm": 0.03313204646110535,
|
| 99 |
+
"learning_rate": 0.006398586661694321,
|
| 100 |
+
"loss": 0.211,
|
| 101 |
+
"step": 2600
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 1.3435700575815739,
|
| 105 |
+
"grad_norm": 0.038085468113422394,
|
| 106 |
+
"learning_rate": 0.0059048756280071606,
|
| 107 |
+
"loss": 0.2086,
|
| 108 |
+
"step": 2800
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 1.4395393474088292,
|
| 112 |
+
"grad_norm": 0.027709227055311203,
|
| 113 |
+
"learning_rate": 0.0054018463387160995,
|
| 114 |
+
"loss": 0.2073,
|
| 115 |
+
"step": 3000
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.5355086372360844,
|
| 119 |
+
"grad_norm": 0.03159138932824135,
|
| 120 |
+
"learning_rate": 0.004894678904047102,
|
| 121 |
+
"loss": 0.2028,
|
| 122 |
+
"step": 3200
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.6314779270633397,
|
| 126 |
+
"grad_norm": 0.032402586191892624,
|
| 127 |
+
"learning_rate": 0.004388596048144727,
|
| 128 |
+
"loss": 0.2047,
|
| 129 |
+
"step": 3400
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.727447216890595,
|
| 133 |
+
"grad_norm": 0.028337595984339714,
|
| 134 |
+
"learning_rate": 0.003888809326345519,
|
| 135 |
+
"loss": 0.2012,
|
| 136 |
+
"step": 3600
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.8234165067178503,
|
| 140 |
+
"grad_norm": 0.026909947395324707,
|
| 141 |
+
"learning_rate": 0.0034004654574658734,
|
| 142 |
+
"loss": 0.1998,
|
| 143 |
+
"step": 3800
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.9193857965451055,
|
| 147 |
+
"grad_norm": 0.031789738684892654,
|
| 148 |
+
"learning_rate": 0.002928593323765374,
|
| 149 |
+
"loss": 0.195,
|
| 150 |
+
"step": 4000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 2.015355086372361,
|
| 154 |
+
"grad_norm": 0.029551630839705467,
|
| 155 |
+
"learning_rate": 0.002478052184371007,
|
| 156 |
+
"loss": 0.1912,
|
| 157 |
+
"step": 4200
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 2.111324376199616,
|
| 161 |
+
"grad_norm": 0.03310905396938324,
|
| 162 |
+
"learning_rate": 0.002053481635451576,
|
| 163 |
+
"loss": 0.1712,
|
| 164 |
+
"step": 4400
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 2.2072936660268714,
|
| 168 |
+
"grad_norm": 0.02764611691236496,
|
| 169 |
+
"learning_rate": 0.0016592538324439927,
|
| 170 |
+
"loss": 0.1706,
|
| 171 |
+
"step": 4600
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 2.3032629558541267,
|
| 175 |
+
"grad_norm": 0.027012605220079422,
|
| 176 |
+
"learning_rate": 0.0012994284663388062,
|
| 177 |
+
"loss": 0.1718,
|
| 178 |
+
"step": 4800
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 2.399232245681382,
|
| 182 |
+
"grad_norm": 0.028096886351704597,
|
| 183 |
+
"learning_rate": 0.0009777109576715644,
|
| 184 |
+
"loss": 0.1659,
|
| 185 |
+
"step": 5000
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 2.495201535508637,
|
| 189 |
+
"grad_norm": 0.030632272362709045,
|
| 190 |
+
"learning_rate": 0.0006974142987311794,
|
| 191 |
+
"loss": 0.169,
|
| 192 |
+
"step": 5200
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 2.5911708253358925,
|
| 196 |
+
"grad_norm": 0.028473207727074623,
|
| 197 |
+
"learning_rate": 0.0004614249369277501,
|
| 198 |
+
"loss": 0.1681,
|
| 199 |
+
"step": 5400
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 2.6871401151631478,
|
| 203 |
+
"grad_norm": 0.03663257881999016,
|
| 204 |
+
"learning_rate": 0.0002721730506471498,
|
| 205 |
+
"loss": 0.1651,
|
| 206 |
+
"step": 5600
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 2.783109404990403,
|
| 210 |
+
"grad_norm": 0.02837834507226944,
|
| 211 |
+
"learning_rate": 0.00013160752368655492,
|
| 212 |
+
"loss": 0.1629,
|
| 213 |
+
"step": 5800
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 2.8790786948176583,
|
| 217 |
+
"grad_norm": 0.03114727884531021,
|
| 218 |
+
"learning_rate": 4.11758759799491e-05,
|
| 219 |
+
"loss": 0.1642,
|
| 220 |
+
"step": 6000
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 2.9750479846449136,
|
| 224 |
+
"grad_norm": 0.028946418315172195,
|
| 225 |
+
"learning_rate": 1.809357283573676e-06,
|
| 226 |
+
"loss": 0.1653,
|
| 227 |
+
"step": 6200
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 3.0,
|
| 231 |
+
"step": 6252,
|
| 232 |
+
"total_flos": 6.0970588176384e+18,
|
| 233 |
+
"train_loss": 0.5637188162166037,
|
| 234 |
+
"train_runtime": 8208.1422,
|
| 235 |
+
"train_samples_per_second": 36.549,
|
| 236 |
+
"train_steps_per_second": 0.762
|
| 237 |
+
}
|
| 238 |
+
],
|
| 239 |
+
"logging_steps": 200,
|
| 240 |
+
"max_steps": 6252,
|
| 241 |
+
"num_input_tokens_seen": 0,
|
| 242 |
+
"num_train_epochs": 3,
|
| 243 |
+
"save_steps": 0,
|
| 244 |
+
"stateful_callbacks": {
|
| 245 |
+
"TrainerControl": {
|
| 246 |
+
"args": {
|
| 247 |
+
"should_epoch_stop": false,
|
| 248 |
+
"should_evaluate": false,
|
| 249 |
+
"should_log": false,
|
| 250 |
+
"should_save": true,
|
| 251 |
+
"should_training_stop": true
|
| 252 |
+
},
|
| 253 |
+
"attributes": {}
|
| 254 |
+
}
|
| 255 |
+
},
|
| 256 |
+
"total_flos": 6.0970588176384e+18,
|
| 257 |
+
"train_batch_size": 48,
|
| 258 |
+
"trial_name": null,
|
| 259 |
+
"trial_params": null
|
| 260 |
+
}
|
nl_tasks/exp100/run_ex08/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exp100/run_ex08/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exp100/run_ex08/trainer_state.json
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 4168,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.09596928982725528,
|
| 14 |
+
"grad_norm": 0.05143404379487038,
|
| 15 |
+
"learning_rate": 0.019928633670077564,
|
| 16 |
+
"loss": 0.7492,
|
| 17 |
+
"step": 200
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.19193857965451055,
|
| 21 |
+
"grad_norm": 0.049090251326560974,
|
| 22 |
+
"learning_rate": 0.01963282596108034,
|
| 23 |
+
"loss": 0.2822,
|
| 24 |
+
"step": 400
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.28790786948176583,
|
| 28 |
+
"grad_norm": 0.03802068158984184,
|
| 29 |
+
"learning_rate": 0.01911406409935458,
|
| 30 |
+
"loss": 0.2606,
|
| 31 |
+
"step": 600
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.3838771593090211,
|
| 35 |
+
"grad_norm": 0.03355047479271889,
|
| 36 |
+
"learning_rate": 0.01838435495721545,
|
| 37 |
+
"loss": 0.2506,
|
| 38 |
+
"step": 800
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.4798464491362764,
|
| 42 |
+
"grad_norm": 0.03283309563994408,
|
| 43 |
+
"learning_rate": 0.017460587834249728,
|
| 44 |
+
"loss": 0.2402,
|
| 45 |
+
"step": 1000
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.5758157389635317,
|
| 49 |
+
"grad_norm": 0.028441881760954857,
|
| 50 |
+
"learning_rate": 0.016364143550276163,
|
| 51 |
+
"loss": 0.2378,
|
| 52 |
+
"step": 1200
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.6717850287907869,
|
| 56 |
+
"grad_norm": 0.025979651138186455,
|
| 57 |
+
"learning_rate": 0.015120399580952997,
|
| 58 |
+
"loss": 0.2336,
|
| 59 |
+
"step": 1400
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.7677543186180422,
|
| 63 |
+
"grad_norm": 0.023276863619685173,
|
| 64 |
+
"learning_rate": 0.013758142689791999,
|
| 65 |
+
"loss": 0.2249,
|
| 66 |
+
"step": 1600
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.8637236084452975,
|
| 70 |
+
"grad_norm": 0.02259671501815319,
|
| 71 |
+
"learning_rate": 0.012308902651357206,
|
| 72 |
+
"loss": 0.2221,
|
| 73 |
+
"step": 1800
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.9596928982725528,
|
| 77 |
+
"grad_norm": 0.01977524533867836,
|
| 78 |
+
"learning_rate": 0.010806222486790591,
|
| 79 |
+
"loss": 0.2172,
|
| 80 |
+
"step": 2000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.055662188099808,
|
| 84 |
+
"grad_norm": 0.02073797583580017,
|
| 85 |
+
"learning_rate": 0.009284882102244985,
|
| 86 |
+
"loss": 0.2027,
|
| 87 |
+
"step": 2200
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 1.1516314779270633,
|
| 91 |
+
"grad_norm": 0.020284440368413925,
|
| 92 |
+
"learning_rate": 0.00778009329930577,
|
| 93 |
+
"loss": 0.1956,
|
| 94 |
+
"step": 2400
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 1.2476007677543186,
|
| 98 |
+
"grad_norm": 0.02195167914032936,
|
| 99 |
+
"learning_rate": 0.0063266847890853664,
|
| 100 |
+
"loss": 0.1924,
|
| 101 |
+
"step": 2600
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 1.3435700575815739,
|
| 105 |
+
"grad_norm": 0.01651054248213768,
|
| 106 |
+
"learning_rate": 0.004958296073042006,
|
| 107 |
+
"loss": 0.1901,
|
| 108 |
+
"step": 2800
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 1.4395393474088292,
|
| 112 |
+
"grad_norm": 0.021048910915851593,
|
| 113 |
+
"learning_rate": 0.003706598848351719,
|
| 114 |
+
"loss": 0.1877,
|
| 115 |
+
"step": 3000
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.5355086372360844,
|
| 119 |
+
"grad_norm": 0.01921105571091175,
|
| 120 |
+
"learning_rate": 0.002600563958599863,
|
| 121 |
+
"loss": 0.1824,
|
| 122 |
+
"step": 3200
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.6314779270633397,
|
| 126 |
+
"grad_norm": 0.02395492233335972,
|
| 127 |
+
"learning_rate": 0.0016657908564007829,
|
| 128 |
+
"loss": 0.1825,
|
| 129 |
+
"step": 3400
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.727447216890595,
|
| 133 |
+
"grad_norm": 0.01738915778696537,
|
| 134 |
+
"learning_rate": 0.0009239150976999733,
|
| 135 |
+
"loss": 0.1793,
|
| 136 |
+
"step": 3600
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.8234165067178503,
|
| 140 |
+
"grad_norm": 0.018468189984560013,
|
| 141 |
+
"learning_rate": 0.0003921075814505837,
|
| 142 |
+
"loss": 0.1784,
|
| 143 |
+
"step": 3800
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.9193857965451055,
|
| 147 |
+
"grad_norm": 0.017226792871952057,
|
| 148 |
+
"learning_rate": 8.267712488642776e-05,
|
| 149 |
+
"loss": 0.1743,
|
| 150 |
+
"step": 4000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 2.0,
|
| 154 |
+
"step": 4168,
|
| 155 |
+
"total_flos": 4.0647058784256e+18,
|
| 156 |
+
"train_loss": 0.2365957404738882,
|
| 157 |
+
"train_runtime": 5474.4546,
|
| 158 |
+
"train_samples_per_second": 36.533,
|
| 159 |
+
"train_steps_per_second": 0.761
|
| 160 |
+
}
|
| 161 |
+
],
|
| 162 |
+
"logging_steps": 200,
|
| 163 |
+
"max_steps": 4168,
|
| 164 |
+
"num_input_tokens_seen": 0,
|
| 165 |
+
"num_train_epochs": 2,
|
| 166 |
+
"save_steps": 0,
|
| 167 |
+
"stateful_callbacks": {
|
| 168 |
+
"TrainerControl": {
|
| 169 |
+
"args": {
|
| 170 |
+
"should_epoch_stop": false,
|
| 171 |
+
"should_evaluate": false,
|
| 172 |
+
"should_log": false,
|
| 173 |
+
"should_save": true,
|
| 174 |
+
"should_training_stop": true
|
| 175 |
+
},
|
| 176 |
+
"attributes": {}
|
| 177 |
+
}
|
| 178 |
+
},
|
| 179 |
+
"total_flos": 4.0647058784256e+18,
|
| 180 |
+
"train_batch_size": 48,
|
| 181 |
+
"trial_name": null,
|
| 182 |
+
"trial_params": null
|
| 183 |
+
}
|