Batch upload part 18
Browse files- nl_tasks/exp100/run_ex08/ft/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex08/ft/tokenizer.json +0 -0
- nl_tasks/exp100/run_ex08/ft/tokenizer.model +3 -0
- nl_tasks/exp100/run_ex08/ft2/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex08/ft2/adapter_model.bin +3 -0
- nl_tasks/exp100/run_ex09/ft/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex09/ft/special_tokens_map.json +24 -0
- nl_tasks/exp100/run_ex09/ft/tokenizer.json +0 -0
- nl_tasks/exp100/run_ex09/ft/tokenizer.model +3 -0
- nl_tasks/exp100/run_ex09/ft/tokenizer_config.json +43 -0
- nl_tasks/exp100/run_ex09/ft2/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex09/ft2/adapter_model.bin +3 -0
- nl_tasks/exp100/run_ex09/trainer_state.json +260 -0
- nl_tasks/exp100/run_ex10/ft/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex10/ft/special_tokens_map.json +24 -0
- nl_tasks/exp100/run_ex10/ft/tokenizer.json +0 -0
- nl_tasks/exp100/run_ex10/ft/tokenizer.model +3 -0
- nl_tasks/exp100/run_ex10/ft/tokenizer_config.json +43 -0
- nl_tasks/exp100/run_ex10/ft2/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex10/ft2/adapter_model.bin +3 -0
- nl_tasks/exp100/run_ex10/trainer_state.json +183 -0
- nl_tasks/exp100/run_ex11/ft/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex11/ft/special_tokens_map.json +24 -0
- nl_tasks/exp100/run_ex11/ft/tokenizer.json +0 -0
- nl_tasks/exp100/run_ex11/ft/tokenizer.model +3 -0
- nl_tasks/exp100/run_ex11/ft/tokenizer_config.json +43 -0
- nl_tasks/exp100/run_ex11/ft2/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex11/ft2/adapter_model.bin +3 -0
- nl_tasks/exp100/run_ex11/trainer_state.json +183 -0
- nl_tasks/exp100/run_ex12/ft/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex12/ft/special_tokens_map.json +24 -0
- nl_tasks/exp100/run_ex12/ft/tokenizer.json +0 -0
- nl_tasks/exp100/run_ex12/ft/tokenizer.model +3 -0
- nl_tasks/exp100/run_ex12/ft/tokenizer_config.json +43 -0
- nl_tasks/exp100/run_ex12/ft2/adapter_config.json +18 -0
- nl_tasks/exp100/run_ex12/ft2/adapter_model.bin +3 -0
- nl_tasks/exp100/run_ex12/trainer_state.json +260 -0
- nl_tasks/expsBOFT/seed43/trainer_state.json +218 -0
- nl_tasks/expsBOFT/seed44/ft/special_tokens_map.json +24 -0
- nl_tasks/expsBOFT/seed44/ft/tokenizer.json +0 -0
- nl_tasks/expsBOFT/seed44/ft/tokenizer.model +3 -0
- nl_tasks/expsBOFT/seed44/ft/tokenizer_config.json +43 -0
- nl_tasks/expsBOFT/seed44/ft2/README.md +205 -0
- nl_tasks/expsBOFT/seed44/ft2/adapter_config.json +27 -0
- nl_tasks/expsBOFT/seed44/ft2/adapter_model.safetensors +3 -0
- nl_tasks/expsBOFT/seed44/trainer_state.json +218 -0
- nl_tasks/inference/MATH_infer.py +132 -0
- nl_tasks/inference/grader.py +141 -0
- nl_tasks/inference/gsm8k_infer.py +157 -0
- nl_tasks/inference/util.py +253 -0
nl_tasks/exp100/run_ex08/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex08/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exp100/run_ex08/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exp100/run_ex08/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex08/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc7597ccf1305d10c57f61a9c789f5d7a5cc15cc8e54fdc0806057df1fe03a3b
|
| 3 |
+
size 33602915
|
nl_tasks/exp100/run_ex09/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex09/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exp100/run_ex09/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exp100/run_ex09/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exp100/run_ex09/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exp100/run_ex09/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex09/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e041e22247e003d3fa1f62f968d3096e9383222bfc93bfd5deee072308dba1e8
|
| 3 |
+
size 33602915
|
nl_tasks/exp100/run_ex09/trainer_state.json
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 6252,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.09596928982725528,
|
| 14 |
+
"grad_norm": 0.06508654356002808,
|
| 15 |
+
"learning_rate": 0.019976180419211866,
|
| 16 |
+
"loss": 0.5532,
|
| 17 |
+
"step": 200
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.19193857965451055,
|
| 21 |
+
"grad_norm": 0.0456949919462204,
|
| 22 |
+
"learning_rate": 0.01985490438184627,
|
| 23 |
+
"loss": 0.283,
|
| 24 |
+
"step": 400
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.28790786948176583,
|
| 28 |
+
"grad_norm": 0.0334312878549099,
|
| 29 |
+
"learning_rate": 0.019632144212142762,
|
| 30 |
+
"loss": 0.2612,
|
| 31 |
+
"step": 600
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.3838771593090211,
|
| 35 |
+
"grad_norm": 0.04754582419991493,
|
| 36 |
+
"learning_rate": 0.01931019385651278,
|
| 37 |
+
"loss": 0.2547,
|
| 38 |
+
"step": 800
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.4798464491362764,
|
| 42 |
+
"grad_norm": 0.02298681065440178,
|
| 43 |
+
"learning_rate": 0.018892368705063736,
|
| 44 |
+
"loss": 0.242,
|
| 45 |
+
"step": 1000
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.5758157389635317,
|
| 49 |
+
"grad_norm": 0.021144121885299683,
|
| 50 |
+
"learning_rate": 0.018382971450274496,
|
| 51 |
+
"loss": 0.2388,
|
| 52 |
+
"step": 1200
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.6717850287907869,
|
| 56 |
+
"grad_norm": 0.023693973198533058,
|
| 57 |
+
"learning_rate": 0.01778724777859868,
|
| 58 |
+
"loss": 0.2355,
|
| 59 |
+
"step": 1400
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.7677543186180422,
|
| 63 |
+
"grad_norm": 0.019977454096078873,
|
| 64 |
+
"learning_rate": 0.017111332351276085,
|
| 65 |
+
"loss": 0.2275,
|
| 66 |
+
"step": 1600
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.8637236084452975,
|
| 70 |
+
"grad_norm": 0.019784899428486824,
|
| 71 |
+
"learning_rate": 0.01636218563063265,
|
| 72 |
+
"loss": 0.2254,
|
| 73 |
+
"step": 1800
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.9596928982725528,
|
| 77 |
+
"grad_norm": 0.020388498902320862,
|
| 78 |
+
"learning_rate": 0.015547522202421078,
|
| 79 |
+
"loss": 0.2216,
|
| 80 |
+
"step": 2000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.055662188099808,
|
| 84 |
+
"grad_norm": 0.017220880836248398,
|
| 85 |
+
"learning_rate": 0.014675731332326341,
|
| 86 |
+
"loss": 0.2087,
|
| 87 |
+
"step": 2200
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 1.1516314779270633,
|
| 91 |
+
"grad_norm": 0.018207907676696777,
|
| 92 |
+
"learning_rate": 0.013755790574731894,
|
| 93 |
+
"loss": 0.2038,
|
| 94 |
+
"step": 2400
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 1.2476007677543186,
|
| 98 |
+
"grad_norm": 0.016460182145237923,
|
| 99 |
+
"learning_rate": 0.012797173323388642,
|
| 100 |
+
"loss": 0.202,
|
| 101 |
+
"step": 2600
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 1.3435700575815739,
|
| 105 |
+
"grad_norm": 0.013017100282013416,
|
| 106 |
+
"learning_rate": 0.011809751256014321,
|
| 107 |
+
"loss": 0.2012,
|
| 108 |
+
"step": 2800
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 1.4395393474088292,
|
| 112 |
+
"grad_norm": 0.018365703523159027,
|
| 113 |
+
"learning_rate": 0.010803692677432199,
|
| 114 |
+
"loss": 0.2005,
|
| 115 |
+
"step": 3000
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.5355086372360844,
|
| 119 |
+
"grad_norm": 0.01587655022740364,
|
| 120 |
+
"learning_rate": 0.009789357808094205,
|
| 121 |
+
"loss": 0.1964,
|
| 122 |
+
"step": 3200
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.6314779270633397,
|
| 126 |
+
"grad_norm": 0.01589033380150795,
|
| 127 |
+
"learning_rate": 0.008777192096289453,
|
| 128 |
+
"loss": 0.198,
|
| 129 |
+
"step": 3400
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.727447216890595,
|
| 133 |
+
"grad_norm": 0.014157130382955074,
|
| 134 |
+
"learning_rate": 0.007777618652691038,
|
| 135 |
+
"loss": 0.1951,
|
| 136 |
+
"step": 3600
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.8234165067178503,
|
| 140 |
+
"grad_norm": 0.013822129927575588,
|
| 141 |
+
"learning_rate": 0.006800930914931747,
|
| 142 |
+
"loss": 0.1941,
|
| 143 |
+
"step": 3800
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.9193857965451055,
|
| 147 |
+
"grad_norm": 0.013463828712701797,
|
| 148 |
+
"learning_rate": 0.005857186647530748,
|
| 149 |
+
"loss": 0.1892,
|
| 150 |
+
"step": 4000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 2.015355086372361,
|
| 154 |
+
"grad_norm": 0.015684494748711586,
|
| 155 |
+
"learning_rate": 0.004956104368742014,
|
| 156 |
+
"loss": 0.1853,
|
| 157 |
+
"step": 4200
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 2.111324376199616,
|
| 161 |
+
"grad_norm": 0.01606130413711071,
|
| 162 |
+
"learning_rate": 0.004106963270903152,
|
| 163 |
+
"loss": 0.1642,
|
| 164 |
+
"step": 4400
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 2.2072936660268714,
|
| 168 |
+
"grad_norm": 0.015006215311586857,
|
| 169 |
+
"learning_rate": 0.0033185076648879854,
|
| 170 |
+
"loss": 0.164,
|
| 171 |
+
"step": 4600
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 2.3032629558541267,
|
| 175 |
+
"grad_norm": 0.013956602662801743,
|
| 176 |
+
"learning_rate": 0.0025988569326776123,
|
| 177 |
+
"loss": 0.165,
|
| 178 |
+
"step": 4800
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 2.399232245681382,
|
| 182 |
+
"grad_norm": 0.01500143762677908,
|
| 183 |
+
"learning_rate": 0.0019554219153431287,
|
| 184 |
+
"loss": 0.1593,
|
| 185 |
+
"step": 5000
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 2.495201535508637,
|
| 189 |
+
"grad_norm": 0.016031745821237564,
|
| 190 |
+
"learning_rate": 0.0013948285974623588,
|
| 191 |
+
"loss": 0.1621,
|
| 192 |
+
"step": 5200
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 2.5911708253358925,
|
| 196 |
+
"grad_norm": 0.01416528970003128,
|
| 197 |
+
"learning_rate": 0.0009228498738555002,
|
| 198 |
+
"loss": 0.161,
|
| 199 |
+
"step": 5400
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 2.6871401151631478,
|
| 203 |
+
"grad_norm": 0.01726922020316124,
|
| 204 |
+
"learning_rate": 0.0005443461012942996,
|
| 205 |
+
"loss": 0.1576,
|
| 206 |
+
"step": 5600
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 2.783109404990403,
|
| 210 |
+
"grad_norm": 0.015128469094634056,
|
| 211 |
+
"learning_rate": 0.00026321504737310985,
|
| 212 |
+
"loss": 0.1558,
|
| 213 |
+
"step": 5800
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 2.8790786948176583,
|
| 217 |
+
"grad_norm": 0.016625599935650826,
|
| 218 |
+
"learning_rate": 8.23517519598982e-05,
|
| 219 |
+
"loss": 0.1565,
|
| 220 |
+
"step": 6000
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 2.9750479846449136,
|
| 224 |
+
"grad_norm": 0.015019167214632034,
|
| 225 |
+
"learning_rate": 3.618714567147352e-06,
|
| 226 |
+
"loss": 0.1576,
|
| 227 |
+
"step": 6200
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 3.0,
|
| 231 |
+
"step": 6252,
|
| 232 |
+
"total_flos": 6.0970588176384e+18,
|
| 233 |
+
"train_loss": 0.2099078561889004,
|
| 234 |
+
"train_runtime": 8160.2371,
|
| 235 |
+
"train_samples_per_second": 36.764,
|
| 236 |
+
"train_steps_per_second": 0.766
|
| 237 |
+
}
|
| 238 |
+
],
|
| 239 |
+
"logging_steps": 200,
|
| 240 |
+
"max_steps": 6252,
|
| 241 |
+
"num_input_tokens_seen": 0,
|
| 242 |
+
"num_train_epochs": 3,
|
| 243 |
+
"save_steps": 0,
|
| 244 |
+
"stateful_callbacks": {
|
| 245 |
+
"TrainerControl": {
|
| 246 |
+
"args": {
|
| 247 |
+
"should_epoch_stop": false,
|
| 248 |
+
"should_evaluate": false,
|
| 249 |
+
"should_log": false,
|
| 250 |
+
"should_save": true,
|
| 251 |
+
"should_training_stop": true
|
| 252 |
+
},
|
| 253 |
+
"attributes": {}
|
| 254 |
+
}
|
| 255 |
+
},
|
| 256 |
+
"total_flos": 6.0970588176384e+18,
|
| 257 |
+
"train_batch_size": 48,
|
| 258 |
+
"trial_name": null,
|
| 259 |
+
"trial_params": null
|
| 260 |
+
}
|
nl_tasks/exp100/run_ex10/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex10/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exp100/run_ex10/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exp100/run_ex10/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exp100/run_ex10/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exp100/run_ex10/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex10/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff09e3f976b0f890a445477281ac6c563f8b2b11869aff99580213720ae3ec8f
|
| 3 |
+
size 33602915
|
nl_tasks/exp100/run_ex10/trainer_state.json
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 4168,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.09596928982725528,
|
| 14 |
+
"grad_norm": 0.17983072996139526,
|
| 15 |
+
"learning_rate": 0.029892950505116346,
|
| 16 |
+
"loss": 5.4284,
|
| 17 |
+
"step": 200
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.19193857965451055,
|
| 21 |
+
"grad_norm": 0.02686592936515808,
|
| 22 |
+
"learning_rate": 0.02944923894162051,
|
| 23 |
+
"loss": 0.3284,
|
| 24 |
+
"step": 400
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.28790786948176583,
|
| 28 |
+
"grad_norm": 0.02659301459789276,
|
| 29 |
+
"learning_rate": 0.028671096149031867,
|
| 30 |
+
"loss": 0.2782,
|
| 31 |
+
"step": 600
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.3838771593090211,
|
| 35 |
+
"grad_norm": 0.021944062784314156,
|
| 36 |
+
"learning_rate": 0.027576532435823177,
|
| 37 |
+
"loss": 0.2639,
|
| 38 |
+
"step": 800
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.4798464491362764,
|
| 42 |
+
"grad_norm": 0.019736966118216515,
|
| 43 |
+
"learning_rate": 0.02619088175137459,
|
| 44 |
+
"loss": 0.2518,
|
| 45 |
+
"step": 1000
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.5758157389635317,
|
| 49 |
+
"grad_norm": 0.020629985257983208,
|
| 50 |
+
"learning_rate": 0.024546215325414244,
|
| 51 |
+
"loss": 0.2478,
|
| 52 |
+
"step": 1200
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.6717850287907869,
|
| 56 |
+
"grad_norm": 0.022958872839808464,
|
| 57 |
+
"learning_rate": 0.022680599371429494,
|
| 58 |
+
"loss": 0.2429,
|
| 59 |
+
"step": 1400
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.7677543186180422,
|
| 63 |
+
"grad_norm": 0.020077615976333618,
|
| 64 |
+
"learning_rate": 0.020637214034687996,
|
| 65 |
+
"loss": 0.2336,
|
| 66 |
+
"step": 1600
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.8637236084452975,
|
| 70 |
+
"grad_norm": 0.018682507798075676,
|
| 71 |
+
"learning_rate": 0.018463353977035808,
|
| 72 |
+
"loss": 0.2302,
|
| 73 |
+
"step": 1800
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.9596928982725528,
|
| 77 |
+
"grad_norm": 0.02266324870288372,
|
| 78 |
+
"learning_rate": 0.016209333730185887,
|
| 79 |
+
"loss": 0.225,
|
| 80 |
+
"step": 2000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.055662188099808,
|
| 84 |
+
"grad_norm": 0.01823570765554905,
|
| 85 |
+
"learning_rate": 0.013927323153367477,
|
| 86 |
+
"loss": 0.2111,
|
| 87 |
+
"step": 2200
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 1.1516314779270633,
|
| 91 |
+
"grad_norm": 0.02119363099336624,
|
| 92 |
+
"learning_rate": 0.011670139948958654,
|
| 93 |
+
"loss": 0.2052,
|
| 94 |
+
"step": 2400
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 1.2476007677543186,
|
| 98 |
+
"grad_norm": 0.018385590985417366,
|
| 99 |
+
"learning_rate": 0.009490027183628048,
|
| 100 |
+
"loss": 0.2016,
|
| 101 |
+
"step": 2600
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 1.3435700575815739,
|
| 105 |
+
"grad_norm": 0.01590455323457718,
|
| 106 |
+
"learning_rate": 0.0074374441095630085,
|
| 107 |
+
"loss": 0.1987,
|
| 108 |
+
"step": 2800
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 1.4395393474088292,
|
| 112 |
+
"grad_norm": 0.017842255532741547,
|
| 113 |
+
"learning_rate": 0.0055598982725275775,
|
| 114 |
+
"loss": 0.1957,
|
| 115 |
+
"step": 3000
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.5355086372360844,
|
| 119 |
+
"grad_norm": 0.018860826268792152,
|
| 120 |
+
"learning_rate": 0.0039008459378997943,
|
| 121 |
+
"loss": 0.1901,
|
| 122 |
+
"step": 3200
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.6314779270633397,
|
| 126 |
+
"grad_norm": 0.018248997628688812,
|
| 127 |
+
"learning_rate": 0.002498686284601174,
|
| 128 |
+
"loss": 0.1899,
|
| 129 |
+
"step": 3400
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.727447216890595,
|
| 133 |
+
"grad_norm": 0.01603817380964756,
|
| 134 |
+
"learning_rate": 0.0013858726465499599,
|
| 135 |
+
"loss": 0.1865,
|
| 136 |
+
"step": 3600
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.8234165067178503,
|
| 140 |
+
"grad_norm": 0.016947340220212936,
|
| 141 |
+
"learning_rate": 0.0005881613721758754,
|
| 142 |
+
"loss": 0.1848,
|
| 143 |
+
"step": 3800
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.9193857965451055,
|
| 147 |
+
"grad_norm": 0.018619216978549957,
|
| 148 |
+
"learning_rate": 0.00012401568732964163,
|
| 149 |
+
"loss": 0.1808,
|
| 150 |
+
"step": 4000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 2.0,
|
| 154 |
+
"step": 4168,
|
| 155 |
+
"total_flos": 4.0647058784256e+18,
|
| 156 |
+
"train_loss": 0.47151951963750505,
|
| 157 |
+
"train_runtime": 5453.3681,
|
| 158 |
+
"train_samples_per_second": 36.675,
|
| 159 |
+
"train_steps_per_second": 0.764
|
| 160 |
+
}
|
| 161 |
+
],
|
| 162 |
+
"logging_steps": 200,
|
| 163 |
+
"max_steps": 4168,
|
| 164 |
+
"num_input_tokens_seen": 0,
|
| 165 |
+
"num_train_epochs": 2,
|
| 166 |
+
"save_steps": 0,
|
| 167 |
+
"stateful_callbacks": {
|
| 168 |
+
"TrainerControl": {
|
| 169 |
+
"args": {
|
| 170 |
+
"should_epoch_stop": false,
|
| 171 |
+
"should_evaluate": false,
|
| 172 |
+
"should_log": false,
|
| 173 |
+
"should_save": true,
|
| 174 |
+
"should_training_stop": true
|
| 175 |
+
},
|
| 176 |
+
"attributes": {}
|
| 177 |
+
}
|
| 178 |
+
},
|
| 179 |
+
"total_flos": 4.0647058784256e+18,
|
| 180 |
+
"train_batch_size": 48,
|
| 181 |
+
"trial_name": null,
|
| 182 |
+
"trial_params": null
|
| 183 |
+
}
|
nl_tasks/exp100/run_ex11/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex11/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exp100/run_ex11/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exp100/run_ex11/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exp100/run_ex11/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exp100/run_ex11/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex11/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:669642b5dfd24d7a899a1a21a69a5a9cf6d0170c2609f2a54c664123864585da
|
| 3 |
+
size 33602915
|
nl_tasks/exp100/run_ex11/trainer_state.json
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 4168,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.09596928982725528,
|
| 14 |
+
"grad_norm": 0.09253966063261032,
|
| 15 |
+
"learning_rate": 0.007971453468031025,
|
| 16 |
+
"loss": 3.4503,
|
| 17 |
+
"step": 200
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.19193857965451055,
|
| 21 |
+
"grad_norm": 0.06041109189391136,
|
| 22 |
+
"learning_rate": 0.007853130384432137,
|
| 23 |
+
"loss": 0.2986,
|
| 24 |
+
"step": 400
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.28790786948176583,
|
| 28 |
+
"grad_norm": 0.05962882563471794,
|
| 29 |
+
"learning_rate": 0.007645625639741832,
|
| 30 |
+
"loss": 0.2678,
|
| 31 |
+
"step": 600
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.3838771593090211,
|
| 35 |
+
"grad_norm": 0.05638430267572403,
|
| 36 |
+
"learning_rate": 0.007353741982886181,
|
| 37 |
+
"loss": 0.2562,
|
| 38 |
+
"step": 800
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.4798464491362764,
|
| 42 |
+
"grad_norm": 0.043786656111478806,
|
| 43 |
+
"learning_rate": 0.0069842351336998915,
|
| 44 |
+
"loss": 0.2442,
|
| 45 |
+
"step": 1000
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.5758157389635317,
|
| 49 |
+
"grad_norm": 0.04289592057466507,
|
| 50 |
+
"learning_rate": 0.006545657420110465,
|
| 51 |
+
"loss": 0.2413,
|
| 52 |
+
"step": 1200
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.6717850287907869,
|
| 56 |
+
"grad_norm": 0.044572457671165466,
|
| 57 |
+
"learning_rate": 0.006048159832381199,
|
| 58 |
+
"loss": 0.237,
|
| 59 |
+
"step": 1400
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.7677543186180422,
|
| 63 |
+
"grad_norm": 0.03955984488129616,
|
| 64 |
+
"learning_rate": 0.0055032570759168,
|
| 65 |
+
"loss": 0.2277,
|
| 66 |
+
"step": 1600
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.8637236084452975,
|
| 70 |
+
"grad_norm": 0.03934319689869881,
|
| 71 |
+
"learning_rate": 0.004923561060542882,
|
| 72 |
+
"loss": 0.2243,
|
| 73 |
+
"step": 1800
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.9596928982725528,
|
| 77 |
+
"grad_norm": 0.034399278461933136,
|
| 78 |
+
"learning_rate": 0.004322488994716237,
|
| 79 |
+
"loss": 0.2193,
|
| 80 |
+
"step": 2000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.055662188099808,
|
| 84 |
+
"grad_norm": 0.037286121398210526,
|
| 85 |
+
"learning_rate": 0.003713952840897994,
|
| 86 |
+
"loss": 0.2051,
|
| 87 |
+
"step": 2200
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 1.1516314779270633,
|
| 91 |
+
"grad_norm": 0.03788420185446739,
|
| 92 |
+
"learning_rate": 0.0031120373197223083,
|
| 93 |
+
"loss": 0.1981,
|
| 94 |
+
"step": 2400
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 1.2476007677543186,
|
| 98 |
+
"grad_norm": 0.036435652524232864,
|
| 99 |
+
"learning_rate": 0.0025306739156341464,
|
| 100 |
+
"loss": 0.1947,
|
| 101 |
+
"step": 2600
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 1.3435700575815739,
|
| 105 |
+
"grad_norm": 0.03084568865597248,
|
| 106 |
+
"learning_rate": 0.0019833184292168023,
|
| 107 |
+
"loss": 0.1924,
|
| 108 |
+
"step": 2800
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 1.4395393474088292,
|
| 112 |
+
"grad_norm": 0.03642988204956055,
|
| 113 |
+
"learning_rate": 0.0014826395393406876,
|
| 114 |
+
"loss": 0.1901,
|
| 115 |
+
"step": 3000
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.5355086372360844,
|
| 119 |
+
"grad_norm": 0.03472171723842621,
|
| 120 |
+
"learning_rate": 0.0010402255834399453,
|
| 121 |
+
"loss": 0.1851,
|
| 122 |
+
"step": 3200
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.6314779270633397,
|
| 126 |
+
"grad_norm": 0.03253506124019623,
|
| 127 |
+
"learning_rate": 0.0006663163425603131,
|
| 128 |
+
"loss": 0.1857,
|
| 129 |
+
"step": 3400
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.727447216890595,
|
| 133 |
+
"grad_norm": 0.036016546189785004,
|
| 134 |
+
"learning_rate": 0.0003695660390799893,
|
| 135 |
+
"loss": 0.1827,
|
| 136 |
+
"step": 3600
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.8234165067178503,
|
| 140 |
+
"grad_norm": 0.032651085406541824,
|
| 141 |
+
"learning_rate": 0.00015684303258023348,
|
| 142 |
+
"loss": 0.1821,
|
| 143 |
+
"step": 3800
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.9193857965451055,
|
| 147 |
+
"grad_norm": 0.0359911285340786,
|
| 148 |
+
"learning_rate": 3.30708499545711e-05,
|
| 149 |
+
"loss": 0.1782,
|
| 150 |
+
"step": 4000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 2.0,
|
| 154 |
+
"step": 4168,
|
| 155 |
+
"total_flos": 4.0647058784256e+18,
|
| 156 |
+
"train_loss": 0.3700037432723677,
|
| 157 |
+
"train_runtime": 5461.8246,
|
| 158 |
+
"train_samples_per_second": 36.618,
|
| 159 |
+
"train_steps_per_second": 0.763
|
| 160 |
+
}
|
| 161 |
+
],
|
| 162 |
+
"logging_steps": 200,
|
| 163 |
+
"max_steps": 4168,
|
| 164 |
+
"num_input_tokens_seen": 0,
|
| 165 |
+
"num_train_epochs": 2,
|
| 166 |
+
"save_steps": 0,
|
| 167 |
+
"stateful_callbacks": {
|
| 168 |
+
"TrainerControl": {
|
| 169 |
+
"args": {
|
| 170 |
+
"should_epoch_stop": false,
|
| 171 |
+
"should_evaluate": false,
|
| 172 |
+
"should_log": false,
|
| 173 |
+
"should_save": true,
|
| 174 |
+
"should_training_stop": true
|
| 175 |
+
},
|
| 176 |
+
"attributes": {}
|
| 177 |
+
}
|
| 178 |
+
},
|
| 179 |
+
"total_flos": 4.0647058784256e+18,
|
| 180 |
+
"train_batch_size": 48,
|
| 181 |
+
"trial_name": null,
|
| 182 |
+
"trial_params": null
|
| 183 |
+
}
|
nl_tasks/exp100/run_ex12/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex12/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exp100/run_ex12/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exp100/run_ex12/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exp100/run_ex12/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exp100/run_ex12/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exp100/run_ex12/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86f326abe5a1f356d70a24ce7bab2ee7dd2bcb059d6f2282d04dc2f86fee6dc1
|
| 3 |
+
size 33602915
|
nl_tasks/exp100/run_ex12/trainer_state.json
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 6252,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.09596928982725528,
|
| 14 |
+
"grad_norm": 0.13730373978614807,
|
| 15 |
+
"learning_rate": 0.007990472167684746,
|
| 16 |
+
"loss": 0.4657,
|
| 17 |
+
"step": 200
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.19193857965451055,
|
| 21 |
+
"grad_norm": 0.0858209878206253,
|
| 22 |
+
"learning_rate": 0.007941961752738508,
|
| 23 |
+
"loss": 0.2803,
|
| 24 |
+
"step": 400
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.28790786948176583,
|
| 28 |
+
"grad_norm": 0.06287017464637756,
|
| 29 |
+
"learning_rate": 0.007852857684857105,
|
| 30 |
+
"loss": 0.2572,
|
| 31 |
+
"step": 600
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.3838771593090211,
|
| 35 |
+
"grad_norm": 0.053079769015312195,
|
| 36 |
+
"learning_rate": 0.007724077542605112,
|
| 37 |
+
"loss": 0.2477,
|
| 38 |
+
"step": 800
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.4798464491362764,
|
| 42 |
+
"grad_norm": 0.03740881383419037,
|
| 43 |
+
"learning_rate": 0.007556947482025495,
|
| 44 |
+
"loss": 0.2368,
|
| 45 |
+
"step": 1000
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.5758157389635317,
|
| 49 |
+
"grad_norm": 0.034000739455223083,
|
| 50 |
+
"learning_rate": 0.007353188580109798,
|
| 51 |
+
"loss": 0.2351,
|
| 52 |
+
"step": 1200
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.6717850287907869,
|
| 56 |
+
"grad_norm": 0.04023474082350731,
|
| 57 |
+
"learning_rate": 0.007114899111439472,
|
| 58 |
+
"loss": 0.231,
|
| 59 |
+
"step": 1400
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.7677543186180422,
|
| 63 |
+
"grad_norm": 0.030921513214707375,
|
| 64 |
+
"learning_rate": 0.006844532940510433,
|
| 65 |
+
"loss": 0.223,
|
| 66 |
+
"step": 1600
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.8637236084452975,
|
| 70 |
+
"grad_norm": 0.03130370005965233,
|
| 71 |
+
"learning_rate": 0.006544874252253061,
|
| 72 |
+
"loss": 0.221,
|
| 73 |
+
"step": 1800
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.9596928982725528,
|
| 77 |
+
"grad_norm": 0.029764752835035324,
|
| 78 |
+
"learning_rate": 0.006219008880968432,
|
| 79 |
+
"loss": 0.2163,
|
| 80 |
+
"step": 2000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.055662188099808,
|
| 84 |
+
"grad_norm": 0.027886036783456802,
|
| 85 |
+
"learning_rate": 0.0058702925329305366,
|
| 86 |
+
"loss": 0.2034,
|
| 87 |
+
"step": 2200
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 1.1516314779270633,
|
| 91 |
+
"grad_norm": 0.026955854147672653,
|
| 92 |
+
"learning_rate": 0.005502316229892758,
|
| 93 |
+
"loss": 0.1974,
|
| 94 |
+
"step": 2400
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 1.2476007677543186,
|
| 98 |
+
"grad_norm": 0.024037910625338554,
|
| 99 |
+
"learning_rate": 0.005118869329355457,
|
| 100 |
+
"loss": 0.1958,
|
| 101 |
+
"step": 2600
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 1.3435700575815739,
|
| 105 |
+
"grad_norm": 0.02323935180902481,
|
| 106 |
+
"learning_rate": 0.004723900502405729,
|
| 107 |
+
"loss": 0.195,
|
| 108 |
+
"step": 2800
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 1.4395393474088292,
|
| 112 |
+
"grad_norm": 0.023859383538365364,
|
| 113 |
+
"learning_rate": 0.00432147707097288,
|
| 114 |
+
"loss": 0.1937,
|
| 115 |
+
"step": 3000
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.5355086372360844,
|
| 119 |
+
"grad_norm": 0.025017334148287773,
|
| 120 |
+
"learning_rate": 0.0039157431232376815,
|
| 121 |
+
"loss": 0.1901,
|
| 122 |
+
"step": 3200
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.6314779270633397,
|
| 126 |
+
"grad_norm": 0.024762826040387154,
|
| 127 |
+
"learning_rate": 0.0035108768385157816,
|
| 128 |
+
"loss": 0.1915,
|
| 129 |
+
"step": 3400
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.727447216890595,
|
| 133 |
+
"grad_norm": 0.023029997944831848,
|
| 134 |
+
"learning_rate": 0.0031110474610764154,
|
| 135 |
+
"loss": 0.189,
|
| 136 |
+
"step": 3600
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.8234165067178503,
|
| 140 |
+
"grad_norm": 0.023539869114756584,
|
| 141 |
+
"learning_rate": 0.0027203723659726987,
|
| 142 |
+
"loss": 0.1881,
|
| 143 |
+
"step": 3800
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.9193857965451055,
|
| 147 |
+
"grad_norm": 0.023440731689333916,
|
| 148 |
+
"learning_rate": 0.002342874659012299,
|
| 149 |
+
"loss": 0.1832,
|
| 150 |
+
"step": 4000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 2.015355086372361,
|
| 154 |
+
"grad_norm": 0.024401186034083366,
|
| 155 |
+
"learning_rate": 0.0019824417474968055,
|
| 156 |
+
"loss": 0.1798,
|
| 157 |
+
"step": 4200
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 2.111324376199616,
|
| 161 |
+
"grad_norm": 0.02438773214817047,
|
| 162 |
+
"learning_rate": 0.001642785308361261,
|
| 163 |
+
"loss": 0.1588,
|
| 164 |
+
"step": 4400
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 2.2072936660268714,
|
| 168 |
+
"grad_norm": 0.023876527324318886,
|
| 169 |
+
"learning_rate": 0.0013274030659551942,
|
| 170 |
+
"loss": 0.1585,
|
| 171 |
+
"step": 4600
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 2.3032629558541267,
|
| 175 |
+
"grad_norm": 0.021032139658927917,
|
| 176 |
+
"learning_rate": 0.001039542773071045,
|
| 177 |
+
"loss": 0.1594,
|
| 178 |
+
"step": 4800
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 2.399232245681382,
|
| 182 |
+
"grad_norm": 0.023737894371151924,
|
| 183 |
+
"learning_rate": 0.0007821687661372514,
|
| 184 |
+
"loss": 0.1545,
|
| 185 |
+
"step": 5000
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 2.495201535508637,
|
| 189 |
+
"grad_norm": 0.02574790269136429,
|
| 190 |
+
"learning_rate": 0.0005579314389849435,
|
| 191 |
+
"loss": 0.1574,
|
| 192 |
+
"step": 5200
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 2.5911708253358925,
|
| 196 |
+
"grad_norm": 0.022647960111498833,
|
| 197 |
+
"learning_rate": 0.00036913994954220007,
|
| 198 |
+
"loss": 0.157,
|
| 199 |
+
"step": 5400
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 2.6871401151631478,
|
| 203 |
+
"grad_norm": 0.024767184630036354,
|
| 204 |
+
"learning_rate": 0.00021773844051771986,
|
| 205 |
+
"loss": 0.1537,
|
| 206 |
+
"step": 5600
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 2.783109404990403,
|
| 210 |
+
"grad_norm": 0.02572454698383808,
|
| 211 |
+
"learning_rate": 0.00010528601894924394,
|
| 212 |
+
"loss": 0.1524,
|
| 213 |
+
"step": 5800
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 2.8790786948176583,
|
| 217 |
+
"grad_norm": 0.024511748924851418,
|
| 218 |
+
"learning_rate": 3.294070078395928e-05,
|
| 219 |
+
"loss": 0.1536,
|
| 220 |
+
"step": 6000
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 2.9750479846449136,
|
| 224 |
+
"grad_norm": 0.02418585494160652,
|
| 225 |
+
"learning_rate": 1.447485826858941e-06,
|
| 226 |
+
"loss": 0.1544,
|
| 227 |
+
"step": 6200
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 3.0,
|
| 231 |
+
"step": 6252,
|
| 232 |
+
"total_flos": 6.0970588176384e+18,
|
| 233 |
+
"train_loss": 0.2022177925951879,
|
| 234 |
+
"train_runtime": 8189.2736,
|
| 235 |
+
"train_samples_per_second": 36.633,
|
| 236 |
+
"train_steps_per_second": 0.763
|
| 237 |
+
}
|
| 238 |
+
],
|
| 239 |
+
"logging_steps": 200,
|
| 240 |
+
"max_steps": 6252,
|
| 241 |
+
"num_input_tokens_seen": 0,
|
| 242 |
+
"num_train_epochs": 3,
|
| 243 |
+
"save_steps": 0,
|
| 244 |
+
"stateful_callbacks": {
|
| 245 |
+
"TrainerControl": {
|
| 246 |
+
"args": {
|
| 247 |
+
"should_epoch_stop": false,
|
| 248 |
+
"should_evaluate": false,
|
| 249 |
+
"should_log": false,
|
| 250 |
+
"should_save": true,
|
| 251 |
+
"should_training_stop": true
|
| 252 |
+
},
|
| 253 |
+
"attributes": {}
|
| 254 |
+
}
|
| 255 |
+
},
|
| 256 |
+
"total_flos": 6.0970588176384e+18,
|
| 257 |
+
"train_batch_size": 48,
|
| 258 |
+
"trial_name": null,
|
| 259 |
+
"trial_params": null
|
| 260 |
+
}
|
nl_tasks/expsBOFT/seed43/trainer_state.json
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1250,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.08,
|
| 14 |
+
"grad_norm": 0.08375173062086105,
|
| 15 |
+
"learning_rate": 0.000392,
|
| 16 |
+
"loss": 0.5193,
|
| 17 |
+
"step": 50
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.16,
|
| 21 |
+
"grad_norm": 0.09268203377723694,
|
| 22 |
+
"learning_rate": 0.0007920000000000001,
|
| 23 |
+
"loss": 0.3316,
|
| 24 |
+
"step": 100
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.24,
|
| 28 |
+
"grad_norm": 0.08198747783899307,
|
| 29 |
+
"learning_rate": 0.0007964216926581925,
|
| 30 |
+
"loss": 0.304,
|
| 31 |
+
"step": 150
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.32,
|
| 35 |
+
"grad_norm": 0.0816216915845871,
|
| 36 |
+
"learning_rate": 0.0007854602918076551,
|
| 37 |
+
"loss": 0.2918,
|
| 38 |
+
"step": 200
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.4,
|
| 42 |
+
"grad_norm": 0.07457849383354187,
|
| 43 |
+
"learning_rate": 0.0007673184950396212,
|
| 44 |
+
"loss": 0.274,
|
| 45 |
+
"step": 250
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.48,
|
| 49 |
+
"grad_norm": 0.07685171067714691,
|
| 50 |
+
"learning_rate": 0.0007423342497022817,
|
| 51 |
+
"loss": 0.2687,
|
| 52 |
+
"step": 300
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.56,
|
| 56 |
+
"grad_norm": 0.07849128544330597,
|
| 57 |
+
"learning_rate": 0.0007109729650142636,
|
| 58 |
+
"loss": 0.2651,
|
| 59 |
+
"step": 350
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.64,
|
| 63 |
+
"grad_norm": 0.07266736030578613,
|
| 64 |
+
"learning_rate": 0.0006738188423714755,
|
| 65 |
+
"loss": 0.2575,
|
| 66 |
+
"step": 400
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.72,
|
| 70 |
+
"grad_norm": 0.06927025318145752,
|
| 71 |
+
"learning_rate": 0.0006315639927804526,
|
| 72 |
+
"loss": 0.2525,
|
| 73 |
+
"step": 450
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.8,
|
| 77 |
+
"grad_norm": 0.08536054193973541,
|
| 78 |
+
"learning_rate": 0.00058499554413983,
|
| 79 |
+
"loss": 0.2494,
|
| 80 |
+
"step": 500
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.88,
|
| 84 |
+
"grad_norm": 0.07602768391370773,
|
| 85 |
+
"learning_rate": 0.000534980978536894,
|
| 86 |
+
"loss": 0.2429,
|
| 87 |
+
"step": 550
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.96,
|
| 91 |
+
"grad_norm": 0.07055249065160751,
|
| 92 |
+
"learning_rate": 0.00048245197269763485,
|
| 93 |
+
"loss": 0.2457,
|
| 94 |
+
"step": 600
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 1.04,
|
| 98 |
+
"grad_norm": 0.07144515216350555,
|
| 99 |
+
"learning_rate": 0.00042838704261214224,
|
| 100 |
+
"loss": 0.2292,
|
| 101 |
+
"step": 650
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 1.12,
|
| 105 |
+
"grad_norm": 0.07937044650316238,
|
| 106 |
+
"learning_rate": 0.00037379331563313267,
|
| 107 |
+
"loss": 0.2169,
|
| 108 |
+
"step": 700
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 1.2,
|
| 112 |
+
"grad_norm": 0.07409252226352692,
|
| 113 |
+
"learning_rate": 0.00031968776959892677,
|
| 114 |
+
"loss": 0.2098,
|
| 115 |
+
"step": 750
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.28,
|
| 119 |
+
"grad_norm": 0.07844420522451401,
|
| 120 |
+
"learning_rate": 0.00026707828846051743,
|
| 121 |
+
"loss": 0.2145,
|
| 122 |
+
"step": 800
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.3599999999999999,
|
| 126 |
+
"grad_norm": 0.07791652530431747,
|
| 127 |
+
"learning_rate": 0.00021694488731055218,
|
| 128 |
+
"loss": 0.2082,
|
| 129 |
+
"step": 850
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.44,
|
| 133 |
+
"grad_norm": 0.0782908946275711,
|
| 134 |
+
"learning_rate": 0.00017022145655641685,
|
| 135 |
+
"loss": 0.2077,
|
| 136 |
+
"step": 900
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.52,
|
| 140 |
+
"grad_norm": 0.0826650932431221,
|
| 141 |
+
"learning_rate": 0.00012777836530893536,
|
| 142 |
+
"loss": 0.2137,
|
| 143 |
+
"step": 950
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.6,
|
| 147 |
+
"grad_norm": 0.0696156919002533,
|
| 148 |
+
"learning_rate": 9.040624805263558e-05,
|
| 149 |
+
"loss": 0.2076,
|
| 150 |
+
"step": 1000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 1.6800000000000002,
|
| 154 |
+
"grad_norm": 0.06966507434844971,
|
| 155 |
+
"learning_rate": 5.880127662124091e-05,
|
| 156 |
+
"loss": 0.2108,
|
| 157 |
+
"step": 1050
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 1.76,
|
| 161 |
+
"grad_norm": 0.08326321095228195,
|
| 162 |
+
"learning_rate": 3.355219183361582e-05,
|
| 163 |
+
"loss": 0.2106,
|
| 164 |
+
"step": 1100
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 1.8399999999999999,
|
| 168 |
+
"grad_norm": 0.0792745053768158,
|
| 169 |
+
"learning_rate": 1.512933636625089e-05,
|
| 170 |
+
"loss": 0.2073,
|
| 171 |
+
"step": 1150
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 1.92,
|
| 175 |
+
"grad_norm": 0.07648582756519318,
|
| 176 |
+
"learning_rate": 3.8758931591217575e-06,
|
| 177 |
+
"loss": 0.209,
|
| 178 |
+
"step": 1200
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 2.0,
|
| 182 |
+
"grad_norm": 0.0787830799818039,
|
| 183 |
+
"learning_rate": 1.4925668450960217e-09,
|
| 184 |
+
"loss": 0.2124,
|
| 185 |
+
"step": 1250
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 2.0,
|
| 189 |
+
"step": 1250,
|
| 190 |
+
"total_flos": 1.62594677587968e+18,
|
| 191 |
+
"train_loss": 0.25041088790893556,
|
| 192 |
+
"train_runtime": 3374.0916,
|
| 193 |
+
"train_samples_per_second": 23.71,
|
| 194 |
+
"train_steps_per_second": 0.37
|
| 195 |
+
}
|
| 196 |
+
],
|
| 197 |
+
"logging_steps": 50,
|
| 198 |
+
"max_steps": 1250,
|
| 199 |
+
"num_input_tokens_seen": 0,
|
| 200 |
+
"num_train_epochs": 2,
|
| 201 |
+
"save_steps": 0,
|
| 202 |
+
"stateful_callbacks": {
|
| 203 |
+
"TrainerControl": {
|
| 204 |
+
"args": {
|
| 205 |
+
"should_epoch_stop": false,
|
| 206 |
+
"should_evaluate": false,
|
| 207 |
+
"should_log": false,
|
| 208 |
+
"should_save": false,
|
| 209 |
+
"should_training_stop": false
|
| 210 |
+
},
|
| 211 |
+
"attributes": {}
|
| 212 |
+
}
|
| 213 |
+
},
|
| 214 |
+
"total_flos": 1.62594677587968e+18,
|
| 215 |
+
"train_batch_size": 32,
|
| 216 |
+
"trial_name": null,
|
| 217 |
+
"trial_params": null
|
| 218 |
+
}
|
nl_tasks/expsBOFT/seed44/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/expsBOFT/seed44/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/expsBOFT/seed44/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/expsBOFT/seed44/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/expsBOFT/seed44/ft2/README.md
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: meta-llama/Llama-2-7b-hf
|
| 3 |
+
library_name: peft
|
| 4 |
+
tags:
|
| 5 |
+
- base_model:adapter:meta-llama/Llama-2-7b-hf
|
| 6 |
+
- transformers
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
# Model Card for Model ID
|
| 10 |
+
|
| 11 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
## Model Details
|
| 16 |
+
|
| 17 |
+
### Model Description
|
| 18 |
+
|
| 19 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
- **Developed by:** [More Information Needed]
|
| 24 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 25 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 26 |
+
- **Model type:** [More Information Needed]
|
| 27 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 28 |
+
- **License:** [More Information Needed]
|
| 29 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 30 |
+
|
| 31 |
+
### Model Sources [optional]
|
| 32 |
+
|
| 33 |
+
<!-- Provide the basic links for the model. -->
|
| 34 |
+
|
| 35 |
+
- **Repository:** [More Information Needed]
|
| 36 |
+
- **Paper [optional]:** [More Information Needed]
|
| 37 |
+
- **Demo [optional]:** [More Information Needed]
|
| 38 |
+
|
| 39 |
+
## Uses
|
| 40 |
+
|
| 41 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 42 |
+
|
| 43 |
+
### Direct Use
|
| 44 |
+
|
| 45 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 46 |
+
|
| 47 |
+
[More Information Needed]
|
| 48 |
+
|
| 49 |
+
### Downstream Use [optional]
|
| 50 |
+
|
| 51 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 52 |
+
|
| 53 |
+
[More Information Needed]
|
| 54 |
+
|
| 55 |
+
### Out-of-Scope Use
|
| 56 |
+
|
| 57 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 58 |
+
|
| 59 |
+
[More Information Needed]
|
| 60 |
+
|
| 61 |
+
## Bias, Risks, and Limitations
|
| 62 |
+
|
| 63 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 64 |
+
|
| 65 |
+
[More Information Needed]
|
| 66 |
+
|
| 67 |
+
### Recommendations
|
| 68 |
+
|
| 69 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 70 |
+
|
| 71 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 72 |
+
|
| 73 |
+
## How to Get Started with the Model
|
| 74 |
+
|
| 75 |
+
Use the code below to get started with the model.
|
| 76 |
+
|
| 77 |
+
[More Information Needed]
|
| 78 |
+
|
| 79 |
+
## Training Details
|
| 80 |
+
|
| 81 |
+
### Training Data
|
| 82 |
+
|
| 83 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 84 |
+
|
| 85 |
+
[More Information Needed]
|
| 86 |
+
|
| 87 |
+
### Training Procedure
|
| 88 |
+
|
| 89 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 90 |
+
|
| 91 |
+
#### Preprocessing [optional]
|
| 92 |
+
|
| 93 |
+
[More Information Needed]
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
#### Training Hyperparameters
|
| 97 |
+
|
| 98 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 99 |
+
|
| 100 |
+
#### Speeds, Sizes, Times [optional]
|
| 101 |
+
|
| 102 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 103 |
+
|
| 104 |
+
[More Information Needed]
|
| 105 |
+
|
| 106 |
+
## Evaluation
|
| 107 |
+
|
| 108 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 109 |
+
|
| 110 |
+
### Testing Data, Factors & Metrics
|
| 111 |
+
|
| 112 |
+
#### Testing Data
|
| 113 |
+
|
| 114 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 115 |
+
|
| 116 |
+
[More Information Needed]
|
| 117 |
+
|
| 118 |
+
#### Factors
|
| 119 |
+
|
| 120 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 121 |
+
|
| 122 |
+
[More Information Needed]
|
| 123 |
+
|
| 124 |
+
#### Metrics
|
| 125 |
+
|
| 126 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 127 |
+
|
| 128 |
+
[More Information Needed]
|
| 129 |
+
|
| 130 |
+
### Results
|
| 131 |
+
|
| 132 |
+
[More Information Needed]
|
| 133 |
+
|
| 134 |
+
#### Summary
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
## Model Examination [optional]
|
| 139 |
+
|
| 140 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 141 |
+
|
| 142 |
+
[More Information Needed]
|
| 143 |
+
|
| 144 |
+
## Environmental Impact
|
| 145 |
+
|
| 146 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 147 |
+
|
| 148 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 149 |
+
|
| 150 |
+
- **Hardware Type:** [More Information Needed]
|
| 151 |
+
- **Hours used:** [More Information Needed]
|
| 152 |
+
- **Cloud Provider:** [More Information Needed]
|
| 153 |
+
- **Compute Region:** [More Information Needed]
|
| 154 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 155 |
+
|
| 156 |
+
## Technical Specifications [optional]
|
| 157 |
+
|
| 158 |
+
### Model Architecture and Objective
|
| 159 |
+
|
| 160 |
+
[More Information Needed]
|
| 161 |
+
|
| 162 |
+
### Compute Infrastructure
|
| 163 |
+
|
| 164 |
+
[More Information Needed]
|
| 165 |
+
|
| 166 |
+
#### Hardware
|
| 167 |
+
|
| 168 |
+
[More Information Needed]
|
| 169 |
+
|
| 170 |
+
#### Software
|
| 171 |
+
|
| 172 |
+
[More Information Needed]
|
| 173 |
+
|
| 174 |
+
## Citation [optional]
|
| 175 |
+
|
| 176 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 177 |
+
|
| 178 |
+
**BibTeX:**
|
| 179 |
+
|
| 180 |
+
[More Information Needed]
|
| 181 |
+
|
| 182 |
+
**APA:**
|
| 183 |
+
|
| 184 |
+
[More Information Needed]
|
| 185 |
+
|
| 186 |
+
## Glossary [optional]
|
| 187 |
+
|
| 188 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 189 |
+
|
| 190 |
+
[More Information Needed]
|
| 191 |
+
|
| 192 |
+
## More Information [optional]
|
| 193 |
+
|
| 194 |
+
[More Information Needed]
|
| 195 |
+
|
| 196 |
+
## Model Card Authors [optional]
|
| 197 |
+
|
| 198 |
+
[More Information Needed]
|
| 199 |
+
|
| 200 |
+
## Model Card Contact
|
| 201 |
+
|
| 202 |
+
[More Information Needed]
|
| 203 |
+
### Framework versions
|
| 204 |
+
|
| 205 |
+
- PEFT 0.18.0
|
nl_tasks/expsBOFT/seed44/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"auto_mapping": {
|
| 3 |
+
"base_model_class": "LlamaForCausalLM",
|
| 4 |
+
"parent_library": "transformers.models.llama.modeling_llama"
|
| 5 |
+
},
|
| 6 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"boft_block_num": 0,
|
| 9 |
+
"boft_block_size": 16,
|
| 10 |
+
"boft_dropout": 0.05,
|
| 11 |
+
"boft_n_butterfly_factor": 2,
|
| 12 |
+
"exclude_modules": null,
|
| 13 |
+
"fan_in_fan_out": false,
|
| 14 |
+
"inference_mode": true,
|
| 15 |
+
"init_weights": true,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"modules_to_save": null,
|
| 19 |
+
"peft_type": "BOFT",
|
| 20 |
+
"peft_version": "0.18.0",
|
| 21 |
+
"revision": null,
|
| 22 |
+
"target_modules": [
|
| 23 |
+
"q_proj",
|
| 24 |
+
"v_proj"
|
| 25 |
+
],
|
| 26 |
+
"task_type": null
|
| 27 |
+
}
|
nl_tasks/expsBOFT/seed44/ft2/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:584526a06a1f45f2f77e6a89a7201b05aa25a3d6be60f231b255a32c48c4b261
|
| 3 |
+
size 34619504
|
nl_tasks/expsBOFT/seed44/trainer_state.json
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1250,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.08,
|
| 14 |
+
"grad_norm": 0.08375173062086105,
|
| 15 |
+
"learning_rate": 0.000392,
|
| 16 |
+
"loss": 0.5193,
|
| 17 |
+
"step": 50
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.16,
|
| 21 |
+
"grad_norm": 0.09268203377723694,
|
| 22 |
+
"learning_rate": 0.0007920000000000001,
|
| 23 |
+
"loss": 0.3316,
|
| 24 |
+
"step": 100
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.24,
|
| 28 |
+
"grad_norm": 0.08198747783899307,
|
| 29 |
+
"learning_rate": 0.0007964216926581925,
|
| 30 |
+
"loss": 0.304,
|
| 31 |
+
"step": 150
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.32,
|
| 35 |
+
"grad_norm": 0.0816216915845871,
|
| 36 |
+
"learning_rate": 0.0007854602918076551,
|
| 37 |
+
"loss": 0.2918,
|
| 38 |
+
"step": 200
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.4,
|
| 42 |
+
"grad_norm": 0.07457849383354187,
|
| 43 |
+
"learning_rate": 0.0007673184950396212,
|
| 44 |
+
"loss": 0.274,
|
| 45 |
+
"step": 250
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.48,
|
| 49 |
+
"grad_norm": 0.07685171067714691,
|
| 50 |
+
"learning_rate": 0.0007423342497022817,
|
| 51 |
+
"loss": 0.2687,
|
| 52 |
+
"step": 300
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.56,
|
| 56 |
+
"grad_norm": 0.07849128544330597,
|
| 57 |
+
"learning_rate": 0.0007109729650142636,
|
| 58 |
+
"loss": 0.2651,
|
| 59 |
+
"step": 350
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.64,
|
| 63 |
+
"grad_norm": 0.07266736030578613,
|
| 64 |
+
"learning_rate": 0.0006738188423714755,
|
| 65 |
+
"loss": 0.2575,
|
| 66 |
+
"step": 400
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.72,
|
| 70 |
+
"grad_norm": 0.06927025318145752,
|
| 71 |
+
"learning_rate": 0.0006315639927804526,
|
| 72 |
+
"loss": 0.2525,
|
| 73 |
+
"step": 450
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.8,
|
| 77 |
+
"grad_norm": 0.08536054193973541,
|
| 78 |
+
"learning_rate": 0.00058499554413983,
|
| 79 |
+
"loss": 0.2494,
|
| 80 |
+
"step": 500
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.88,
|
| 84 |
+
"grad_norm": 0.07602768391370773,
|
| 85 |
+
"learning_rate": 0.000534980978536894,
|
| 86 |
+
"loss": 0.2429,
|
| 87 |
+
"step": 550
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.96,
|
| 91 |
+
"grad_norm": 0.07055249065160751,
|
| 92 |
+
"learning_rate": 0.00048245197269763485,
|
| 93 |
+
"loss": 0.2457,
|
| 94 |
+
"step": 600
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 1.04,
|
| 98 |
+
"grad_norm": 0.07144515216350555,
|
| 99 |
+
"learning_rate": 0.00042838704261214224,
|
| 100 |
+
"loss": 0.2292,
|
| 101 |
+
"step": 650
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 1.12,
|
| 105 |
+
"grad_norm": 0.07937044650316238,
|
| 106 |
+
"learning_rate": 0.00037379331563313267,
|
| 107 |
+
"loss": 0.2169,
|
| 108 |
+
"step": 700
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 1.2,
|
| 112 |
+
"grad_norm": 0.07409252226352692,
|
| 113 |
+
"learning_rate": 0.00031968776959892677,
|
| 114 |
+
"loss": 0.2098,
|
| 115 |
+
"step": 750
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 1.28,
|
| 119 |
+
"grad_norm": 0.07844420522451401,
|
| 120 |
+
"learning_rate": 0.00026707828846051743,
|
| 121 |
+
"loss": 0.2145,
|
| 122 |
+
"step": 800
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 1.3599999999999999,
|
| 126 |
+
"grad_norm": 0.07791652530431747,
|
| 127 |
+
"learning_rate": 0.00021694488731055218,
|
| 128 |
+
"loss": 0.2082,
|
| 129 |
+
"step": 850
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 1.44,
|
| 133 |
+
"grad_norm": 0.0782908946275711,
|
| 134 |
+
"learning_rate": 0.00017022145655641685,
|
| 135 |
+
"loss": 0.2077,
|
| 136 |
+
"step": 900
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 1.52,
|
| 140 |
+
"grad_norm": 0.0826650932431221,
|
| 141 |
+
"learning_rate": 0.00012777836530893536,
|
| 142 |
+
"loss": 0.2137,
|
| 143 |
+
"step": 950
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 1.6,
|
| 147 |
+
"grad_norm": 0.0696156919002533,
|
| 148 |
+
"learning_rate": 9.040624805263558e-05,
|
| 149 |
+
"loss": 0.2076,
|
| 150 |
+
"step": 1000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 1.6800000000000002,
|
| 154 |
+
"grad_norm": 0.06966507434844971,
|
| 155 |
+
"learning_rate": 5.880127662124091e-05,
|
| 156 |
+
"loss": 0.2108,
|
| 157 |
+
"step": 1050
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 1.76,
|
| 161 |
+
"grad_norm": 0.08326321095228195,
|
| 162 |
+
"learning_rate": 3.355219183361582e-05,
|
| 163 |
+
"loss": 0.2106,
|
| 164 |
+
"step": 1100
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 1.8399999999999999,
|
| 168 |
+
"grad_norm": 0.0792745053768158,
|
| 169 |
+
"learning_rate": 1.512933636625089e-05,
|
| 170 |
+
"loss": 0.2073,
|
| 171 |
+
"step": 1150
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 1.92,
|
| 175 |
+
"grad_norm": 0.07648582756519318,
|
| 176 |
+
"learning_rate": 3.8758931591217575e-06,
|
| 177 |
+
"loss": 0.209,
|
| 178 |
+
"step": 1200
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 2.0,
|
| 182 |
+
"grad_norm": 0.0787830799818039,
|
| 183 |
+
"learning_rate": 1.4925668450960217e-09,
|
| 184 |
+
"loss": 0.2124,
|
| 185 |
+
"step": 1250
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 2.0,
|
| 189 |
+
"step": 1250,
|
| 190 |
+
"total_flos": 1.62594677587968e+18,
|
| 191 |
+
"train_loss": 0.25041088790893556,
|
| 192 |
+
"train_runtime": 3377.6799,
|
| 193 |
+
"train_samples_per_second": 23.685,
|
| 194 |
+
"train_steps_per_second": 0.37
|
| 195 |
+
}
|
| 196 |
+
],
|
| 197 |
+
"logging_steps": 50,
|
| 198 |
+
"max_steps": 1250,
|
| 199 |
+
"num_input_tokens_seen": 0,
|
| 200 |
+
"num_train_epochs": 2,
|
| 201 |
+
"save_steps": 0,
|
| 202 |
+
"stateful_callbacks": {
|
| 203 |
+
"TrainerControl": {
|
| 204 |
+
"args": {
|
| 205 |
+
"should_epoch_stop": false,
|
| 206 |
+
"should_evaluate": false,
|
| 207 |
+
"should_log": false,
|
| 208 |
+
"should_save": false,
|
| 209 |
+
"should_training_stop": false
|
| 210 |
+
},
|
| 211 |
+
"attributes": {}
|
| 212 |
+
}
|
| 213 |
+
},
|
| 214 |
+
"total_flos": 1.62594677587968e+18,
|
| 215 |
+
"train_batch_size": 32,
|
| 216 |
+
"trial_name": null,
|
| 217 |
+
"trial_params": null
|
| 218 |
+
}
|
nl_tasks/inference/MATH_infer.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import pdb
|
| 4 |
+
import jsonlines
|
| 5 |
+
|
| 6 |
+
import util
|
| 7 |
+
from vllm import LLM, SamplingParams
|
| 8 |
+
import sys
|
| 9 |
+
MAX_INT = sys.maxsize
|
| 10 |
+
INVALID_ANS = "[invalid]"
|
| 11 |
+
MAX_TOKEN = 1408
|
| 12 |
+
|
| 13 |
+
import random
|
| 14 |
+
import numpy as np
|
| 15 |
+
import torch
|
| 16 |
+
import os
|
| 17 |
+
|
| 18 |
+
invalid_outputs = []
|
| 19 |
+
def remove_boxed(s):
    """Strip a LaTeX ``\\boxed{...}`` wrapper and return the inner text.

    Args:
        s: Candidate string, typically the output of
           ``util.last_boxed_only_string``; may be ``None`` when no
           boxed answer was found.

    Returns:
        The content between ``\\boxed{`` and the trailing ``}``, or
        ``None`` when ``s`` is missing or not of that exact form.
    """
    left = "\\boxed{"
    # Guard explicitly instead of the original assert + bare except:
    # s may be None or malformed, and bare except hid real errors.
    if s is None or not s.startswith(left) or not s.endswith("}"):
        return None
    return s[len(left):-1]
|
| 27 |
+
|
| 28 |
+
def process_results(doc, completion, answer):
    """Grade one model completion against the gold MATH answer.

    The completion is expected to end with "The answer is: <ans>"; the
    extracted answer is compared to ``answer`` via ``util.is_equiv``.
    Completions without that marker are recorded in the module-level
    ``invalid_outputs`` list for later inspection.

    Args:
        doc: The prompt/question (kept only for invalid-output logging).
        completion: The raw generated text.
        answer: The gold answer string (may be None).

    Returns:
        True when the extracted answer is equivalent to ``answer``,
        False otherwise (including unparsable completions).
    """
    split_ans = completion.split('The answer is: ')
    if len(split_ans) <= 1:
        # No answer marker found -- keep the sample for debugging.
        invalid_outputs.append({'question': doc, 'output': completion, 'answer': answer})
        return False
    # Take the text after the *last* marker, drop any trailing explanation
    # that starts on a new paragraph, then strip a terminal period.
    extract_ans = split_ans[-1].split('.\n')[0].strip()
    if extract_ans.endswith('.'):
        extract_ans = extract_ans[:-1]
    extract_ans = extract_ans.strip()
    return bool(util.is_equiv(extract_ans, answer))
|
| 47 |
+
def batch_data(data_list, batch_size=1):
    """Split ``data_list`` into consecutive batches of at most ``batch_size``.

    Args:
        data_list: Sequence to split.
        batch_size: Maximum number of items per batch.

    Returns:
        List of slices covering ``data_list`` in order; the final batch
        holds the remainder.
    """
    # BUG FIX: the original merged the remainder into the last full batch
    # (e.g. 120 items / batch_size 50 produced batches of 50 and 70).
    # Uniform slicing guarantees no batch exceeds batch_size.
    return [data_list[i:i + batch_size]
            for i in range(0, len(data_list), batch_size)]
|
| 59 |
+
|
| 60 |
+
def test_hendrycks_math(model, data_path, start=0, end=MAX_INT, batch_size=1, tensor_parallel_size=1):
    """Run greedy vLLM inference on the MATH test set and report accuracy.

    Args:
        model: Path (or HF id) of the model to load into vLLM.
        data_path: JSONL file with "instruction" and "output" fields; the
            gold answer is the last ``\\boxed{...}`` inside "output".
        start, end: Slice of the dataset to evaluate.
        batch_size: Unused here (vLLM batches internally); kept for CLI parity.
        tensor_parallel_size: Number of GPUs for vLLM tensor parallelism.

    Side effects:
        Prints progress/accuracy to stdout and appends a summary line to
        ``output.txt`` in the parent directory of ``model``.
    """
    hendrycks_math_ins = []
    hendrycks_math_answers = []
    problem_prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response: Let's think step by step."
    )
    print('promt =====', problem_prompt)
    with open(data_path, "r+", encoding="utf8") as f:
        for idx, item in enumerate(jsonlines.Reader(f)):
            temp_instr = problem_prompt.format(instruction=item["instruction"])
            hendrycks_math_ins.append(temp_instr)
            solution = item['output']
            # Gold answer = content of the last \boxed{...} in the solution.
            temp_ans = remove_boxed(util.last_boxed_only_string(solution))
            hendrycks_math_answers.append(temp_ans)

    print('total length ===', len(hendrycks_math_ins))
    hendrycks_math_ins = hendrycks_math_ins[start:end]
    hendrycks_math_answers = hendrycks_math_answers[start:end]
    print('lenght ====', len(hendrycks_math_ins))
    # batch_hendrycks_math_ins = batch_data(hendrycks_math_ins, batch_size=batch_size)

    # Stop sequences prevent the model from hallucinating a new task turn.
    stop_tokens = ["Instruction:", "Instruction", "Response:", "Response"]
    sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=MAX_TOKEN, stop=stop_tokens)
    print('sampleing =====', sampling_params)
    llm = LLM(model=model,tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=0.95)

    outputs = llm.generate(hendrycks_math_ins, sampling_params)
    res_completions = [output.outputs[0].text for output in outputs]

    results = []
    for idx, (prompt, completion, prompt_answer) in enumerate(zip(hendrycks_math_ins, res_completions, hendrycks_math_answers)):
        res = process_results(prompt, completion, prompt_answer)
        results.append(res)

    acc = sum(results) / len(results)
    # NOTE(review): len(invalid_outputs) is printed twice; the second field was
    # presumably meant to be the list itself -- left as-is to keep logs stable.
    print('len invalid outputs ====', len(invalid_outputs), ', invalid_outputs===', len(invalid_outputs))
    # print('start===', start, ', end====',end)
    print('length====', len(results), ', acc====', acc*100)

    # BUG FIX: previously read the global `args.model`; use the `model`
    # parameter so the function also works when called programmatically.
    current_path = model
    parent_dir = os.path.dirname(current_path.rstrip('/'))
    output_filename = os.path.join(parent_dir, 'output.txt')
    # output_filename = args.model + 'output.txt'
    with open(output_filename, "a", encoding="utf-8") as f:
        print(f'\n MATH math MAX TOKEN = {MAX_TOKEN}, length==== {len(results)}, math acc %====, {acc*100}', file=f)
|
| 108 |
+
|
| 109 |
+
def parse_args(argv=None):
    """Parse command-line options for the MATH evaluation script.

    Args:
        argv: Optional list of argument strings (useful for testing);
            defaults to ``sys.argv[1:]`` when None.

    Returns:
        argparse.Namespace with model, data_file, start, end,
        batch_size and tensor_parallel_size.
    """
    parser = argparse.ArgumentParser()
    # BUG FIX: default was the int 0 for a str-typed option; use None so a
    # missing --model fails loudly instead of producing a bogus path.
    parser.add_argument("--model", type=str, default=None)  # model path
    parser.add_argument("--data_file", type=str, default='data/MATH_test.jsonl')  # data path
    parser.add_argument("--start", type=int, default=0)  # start index
    parser.add_argument("--end", type=int, default=MAX_INT)  # end index
    parser.add_argument("--batch_size", type=int, default=50)  # batch_size
    parser.add_argument("--tensor_parallel_size", type=int, default=1)  # tensor_parallel_size
    return parser.parse_args(argv)
|
| 118 |
+
|
| 119 |
+
def set_deterministic_seed(seed=42):
    """Seed Python, NumPy and PyTorch RNGs for reproducible runs."""
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    # cuDNN determinism flags deliberately left disabled (perf cost):
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
|
| 126 |
+
|
| 127 |
+
# Script entry point: parse CLI flags, fix the RNG seeds, then run the
# MATH evaluation end to end.
if __name__ == "__main__":
    args = parse_args()
    # Uses the default seed (42) for run-to-run reproducibility.
    set_deterministic_seed()
    test_hendrycks_math(model=args.model, data_path=args.data_file, start=args.start, end=args.end, batch_size=args.batch_size, tensor_parallel_size=args.tensor_parallel_size)
    print('math ends', args.model)
|
| 132 |
+
|
nl_tasks/inference/grader.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
This logic is largely copied from the Hendrycks' MATH release (math_equivalence), and borrowed from:
|
| 3 |
+
- https://github.com/microsoft/ProphetNet/tree/master/CRITIC
|
| 4 |
+
"""
|
| 5 |
+
import multiprocessing
|
| 6 |
+
from math import isclose
|
| 7 |
+
from typing import Union
|
| 8 |
+
|
| 9 |
+
from sympy import simplify, N
|
| 10 |
+
from sympy.parsing.sympy_parser import parse_expr
|
| 11 |
+
from sympy.parsing.latex import parse_latex
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def is_digit(s):
    """Return True when ``s`` parses as a (possibly comma-grouped) number."""
    try:
        float(str(s).replace(",", ""))
    except ValueError:
        return False
    return True
|
| 20 |
+
|
| 21 |
+
def math_equal(prediction: Union[bool, float, str],
               reference: Union[float, str],
               include_percentage: bool = True,
               is_close: bool = True,
               timeout: bool = False,
               ) -> bool:
    """
    Exact match of math if and only if:
    1. numerical equal: both can convert to float and are equal
    2. symbolic equal: both can convert to sympy expression and are equal

    ``include_percentage`` also accepts reference/100 and reference*100
    (handles answers given as percentages). ``is_close`` uses a relative
    float tolerance instead of exact equality. ``timeout`` runs the sympy
    comparison in a subprocess so pathological inputs cannot hang grading.
    """
    try:  # 1. numerical equal
        if is_digit(prediction) and is_digit(reference):
            prediction = float(str(prediction).replace(",", ""))
            reference = float(str(reference).replace(",", ""))
            # number questions
            if include_percentage:
                # Accept the reference interpreted as a percent either way.
                gt_result = [reference / 100, reference, reference * 100]
            else:
                gt_result = [reference]
            for item in gt_result:
                try:
                    if is_close:
                        if isclose(item, prediction, rel_tol=1e-4):
                            return True
                    else:
                        if item == prediction:
                            return True
                except Exception:
                    continue
            # Both sides were numeric but none of the candidates matched:
            # do NOT fall through to the symbolic path.
            return False
    except:
        # Numeric comparison failed unexpectedly; try symbolic matching.
        pass

    # Empty prediction (but a literal 0/False is a legitimate answer).
    if not prediction and prediction not in [0, False]:
        return False

    # 2. symbolic equal
    reference = str(reference).strip()
    prediction = str(prediction).strip()

    ## deal with [], (), {}
    # Strip bracket decoration when only one side is bracketed, so
    # e.g. "(1,2)" can match "1,2".
    pred_str, ref_str = prediction, reference
    if (prediction.startswith("[") and prediction.endswith("]") and not reference.startswith("(")) or \
        (prediction.startswith("(") and prediction.endswith(")") and not reference.startswith("[")):
        pred_str = pred_str.strip("[]()")
        ref_str = ref_str.strip("[]()")
    for s in ['{', "}", "(", ")"]:
        ref_str = ref_str.replace(s, "")
        pred_str = pred_str.replace(s, "")
    if pred_str == ref_str:
        return True

    ## [a, b] vs. [c, d], return a==c and b==d
    # Element-wise recursive comparison for tuples/intervals of equal length.
    if (prediction.startswith("[") and prediction.endswith("]")) and (reference.startswith("[") and reference.endswith("]")) or \
        (prediction.startswith("(") and prediction.endswith(")")) and (reference.startswith("(") and reference.endswith(")")):
        pred_parts = prediction[1:-1].split(",")
        ref_parts = reference[1:-1].split(",")
        if len(pred_parts) == len(ref_parts):
            if all([math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close) for i in range(len(pred_parts))]):
                return True

    # symbolic equal with sympy
    if timeout:
        # Subprocess with a wall-clock limit; sympy.simplify can hang.
        if call_with_timeout(symbolic_equal_process, prediction, reference):
            return True
    else:
        if symbolic_equal(prediction, reference):
            return True

    return False
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def math_equal_process(param):
    """Pool-friendly wrapper: compare the last two entries of *param*.

    Expects ``param[-2]`` to be the prediction and ``param[-1]`` the
    reference answer; any leading entries (e.g. an index) are ignored.
    """
    prediction, reference = param[-2], param[-1]
    return math_equal(prediction, reference)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def symbolic_equal(a, b):
    """Compare two expressions symbolically with sympy.

    Each argument is parsed with ``parse_latex`` and then ``parse_expr``;
    if neither parser succeeds the raw string is used as-is. Returns True
    when ``a - b`` simplifies to 0, or when the numeric evaluations agree
    to a relative tolerance of 1e-3; False otherwise (including on any
    sympy error).
    """
    def _parse(s):
        for parser in [parse_latex, parse_expr]:
            try:
                return parser(s)
            except Exception:  # narrowed from bare `except` so KeyboardInterrupt propagates
                pass
        return s

    a = _parse(a)
    b = _parse(b)

    # Exact symbolic check first.
    try:
        if simplify(a - b) == 0:
            return True
    except Exception:
        pass

    # Fall back to numeric evaluation with a loose tolerance.
    try:
        if isclose(N(a), N(b), rel_tol=1e-3):
            return True
    except Exception:
        pass
    return False
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def symbolic_equal_process(a, b, output_queue):
    """Subprocess target: run symbolic_equal and publish the result on *output_queue*."""
    output_queue.put(symbolic_equal(a, b))
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def call_with_timeout(func, *args, timeout=1, **kwargs):
    """Run *func* in a child process with *output_queue* appended to its args.

    *func* must put its result on the queue (see symbolic_equal_process).
    Returns the queued result, or False when the call times out or the
    child exits without producing a result.
    """
    output_queue = multiprocessing.Queue()
    process_args = args + (output_queue,)
    process = multiprocessing.Process(target=func, args=process_args, kwargs=kwargs)
    process.start()
    process.join(timeout)

    if process.is_alive():
        process.terminate()
        process.join()
        return False

    # Bug fix: the child may have crashed (unpicklable args, exception)
    # without putting anything on the queue; a bare get() would then block
    # forever. Use a bounded get and treat an empty queue as failure.
    try:
        return output_queue.get(timeout=1)
    except Exception:  # queue.Empty
        return False
|
| 141 |
+
|
nl_tasks/inference/gsm8k_infer.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
import jsonlines
|
| 5 |
+
from fraction import Fraction
|
| 6 |
+
from vllm import LLM, SamplingParams
|
| 7 |
+
import sys
|
| 8 |
+
from grader import math_equal
|
| 9 |
+
MAX_INT = sys.maxsize
|
| 10 |
+
MAX_TOKEN = 1024
|
| 11 |
+
|
| 12 |
+
import random
|
| 13 |
+
import numpy as np
|
| 14 |
+
import torch
|
| 15 |
+
import os
|
| 16 |
+
|
| 17 |
+
def is_number(s):
    """Return True when *s* parses as a float or as a unicode numeric character."""
    try:
        float(s)
    except ValueError:
        pass
    else:
        return True
    # float() rejects unicode numerals like '½'; unicodedata handles those.
    try:
        import unicodedata
        unicodedata.numeric(s)
    except (TypeError, ValueError):
        return False
    return True
|
| 30 |
+
|
| 31 |
+
def extract_answer_number(completion):
    """Extract the final numeric answer from text ending in 'The answer is: ...'.

    Returns the rounded integer value, or None when no parsable answer is found.
    Fractions 'a/b' are evaluated; a zero denominator falls back to the numerator.
    """
    parts = completion.split('The answer is: ')
    if len(parts) <= 1:
        return None
    match = re.search(r'[\-+]?\d*[\.,/]?\d+', parts[-1].strip())
    if not match:
        return None

    token = match.group()
    if '/' in token:
        numerator, denominator = token.split('/')[0], token.split('/')[1]
        if not (is_number(denominator) and is_number(numerator)):
            return None
        if denominator == '0':
            # Avoid ZeroDivisionError: use the numerator alone.
            return round(float(numerator.replace(',', '')))
        frac = Fraction(token.replace(',', ''))
        return round(float(frac.numerator / frac.denominator))

    value = float(token.replace(',', ''))
    if value == float('inf'):
        return None
    return round(value)
|
| 58 |
+
|
| 59 |
+
def batch_data(data_list, batch_size=1):
    """Split *data_list* into consecutive batches of at most *batch_size* items.

    Fixes two defects in the original: (1) when ``len(data_list) < batch_size``
    the computed start index went negative and produced a wrong slice, and
    (2) the remainder was merged into the last full batch, yielding a batch
    of up to ``2 * batch_size - 1`` items. The final batch now simply holds
    the remainder (1..batch_size items).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")
    return [data_list[i:i + batch_size] for i in range(0, len(data_list), batch_size)]
|
| 71 |
+
|
| 72 |
+
def gsm8k_test(model, data_path, start=0, end=MAX_INT, batch_size=1, tensor_parallel_size=1):
    """Run greedy GSM8K evaluation with vLLM and append accuracy to <model parent>/output.txt.

    Args:
        model: path of the model directory to load with vLLM.
        data_path: jsonl file whose records carry 'question' and an 'answer'
            field ending in '#### <int>'.
        start, end: slice of the test set to evaluate.
        batch_size: kept for interface compatibility; batching is delegated to vLLM.
        tensor_parallel_size: number of GPUs for vLLM tensor parallelism.
    """
    INVALID_ANS = "[invalid]"
    gsm8k_ins = []
    gsm8k_answers = []
    problem_prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response: Let's think step by step."
    )
    print('prompt =====', problem_prompt)
    with open(data_path, "r+", encoding="utf8") as f:
        for idx, item in enumerate(jsonlines.Reader(f)):
            temp_instr = problem_prompt.format(instruction=item["question"])
            gsm8k_ins.append(temp_instr)
            # Gold answer is the integer after '#### ' (commas stripped).
            temp_ans = item['answer'].split('#### ')[1]
            temp_ans = int(temp_ans.replace(',', ''))
            gsm8k_answers.append(temp_ans)

    gsm8k_ins = gsm8k_ins[start:end]
    gsm8k_answers = gsm8k_answers[start:end]
    print('length ====', len(gsm8k_ins))

    stop_tokens = ["Instruction:", "Instruction", "Response:", "Response"]
    sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=MAX_TOKEN, stop=stop_tokens)
    print('sampling =====', sampling_params)
    llm = LLM(model=model, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=0.90)

    result = []
    outputs = llm.generate(gsm8k_ins, sampling_params)
    res_completions = [output.outputs[0].text for output in outputs]

    invalid_outputs = []
    for prompt, completion, prompt_answer in zip(gsm8k_ins, res_completions, gsm8k_answers):
        y_pred = extract_answer_number(completion)
        if y_pred is not None:
            result.append(float(y_pred) == float(prompt_answer) or math_equal(y_pred, prompt_answer))
        else:
            result.append(False)
            invalid_outputs.append({'question': prompt, 'output': completion, 'answer': prompt_answer})

    acc = sum(result) / len(result)
    print('len invalid outputs ====', len(invalid_outputs))
    print('gsm8k length====', len(result), ', gsm8k acc %====', acc * 100)

    # Bug fix: derive the output path from the `model` parameter, not the
    # module-level `args`, so the function has no hidden dependency on CLI state.
    parent_dir = os.path.dirname(model.rstrip('/'))
    output_filename = os.path.join(parent_dir, 'output.txt')
    with open(output_filename, "a", encoding="utf-8") as f:
        print(f'\n gsm8k MAX TOKEN = {MAX_TOKEN}, length==== {len(result)}, gsm8k acc %====, {acc*100}', file=f)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def parse_args():
    """Build and parse the CLI options for GSM8K inference."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str)  # path of the model to evaluate
    parser.add_argument("--data_file", type=str, default='data/gsm8k_test.jsonl')  # test-set jsonl
    parser.add_argument("--start", type=int, default=0)  # first example index
    parser.add_argument("--end", type=int, default=MAX_INT)  # one-past-last example index
    parser.add_argument("--batch_size", type=int, default=60)  # batch size (vLLM handles batching)
    parser.add_argument("--tensor_parallel_size", type=int, default=1)  # GPUs for tensor parallelism
    return parser.parse_args()
|
| 141 |
+
|
| 142 |
+
def set_deterministic_seed(seed=42):
    """Seed the python, numpy and torch RNGs so runs are reproducible."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# Script entry point: parse CLI options, fix RNG seeds, then run the GSM8K eval.
if __name__ == "__main__":
    args = parse_args()
    set_deterministic_seed()
    gsm8k_test(model=args.model, data_path=args.data_file, start=args.start, end=args.end, batch_size=args.batch_size, tensor_parallel_size=args.tensor_parallel_size)
    print('gsm ends', args.model)
|
| 157 |
+
|
nl_tasks/inference/util.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pprint
|
| 2 |
+
from grader import math_equal
|
| 3 |
+
|
| 4 |
+
def last_boxed_only(sample):
    """Map a (question, answer) pair to (question, last \\boxed{...} span).

    Returns None when the answer contains no boxed expression.
    """
    question, answer = sample
    boxed = last_boxed_only_string(answer)
    return None if boxed is None else (question, boxed)
|
| 10 |
+
|
| 11 |
+
def last_boxed_only_string(string):
    """Return the last '\\boxed{...}' (or '\\fbox{...}') substring of *string*.

    Returns None when no marker is present or its braces never balance.
    """
    start = string.rfind("\\boxed")
    if start < 0:
        start = string.rfind("\\fbox")
        if start < 0:
            return None

    # Scan forward, tracking brace depth, until the marker's braces close.
    depth = 0
    end = None
    for i in range(start, len(string)):
        ch = string[i]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                end = i
                break

    return None if end is None else string[start:end + 1]
|
| 37 |
+
|
| 38 |
+
def only_until_first_boxed_from_tokens(string, tokens):
    """Return the prefix of *tokens* ending before the token that reaches the first box marker.

    Looks for '\\boxed' (falling back to '\\fbox') in *string*; returns None when
    neither is present. Fixes a NameError in the original when *tokens* was
    empty (the loop variable `i` was read without ever being bound).
    """
    idx = string.find("\\boxed")
    if idx < 0:
        idx = string.find("\\fbox")
        if idx < 0:
            return None

    if not tokens:
        return []

    cum_length = 0
    for i, t in enumerate(tokens):
        cum_length += len(t)
        if cum_length >= idx:
            break

    return tokens[:i]
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def clean_numbers(sample):
    """Apply _clean_numbers to every element of *sample*, returning a tuple.

    Falsy input (None, empty sequence) yields None.
    """
    if not sample:
        return None
    return tuple(_clean_numbers(element) for element in sample)
|
| 63 |
+
|
| 64 |
+
def _clean_numbers(string):
|
| 65 |
+
"""
|
| 66 |
+
Clean Numbers in the given string
|
| 67 |
+
|
| 68 |
+
>>> _clean_numbers(None, "Hello 123")
|
| 69 |
+
'Hello 123'
|
| 70 |
+
>>> _clean_numbers(None, "Hello 1234")
|
| 71 |
+
'Hello 1,234'
|
| 72 |
+
>>> _clean_numbers(None, "Hello 1234324asdasd")
|
| 73 |
+
'Hello 1,234,324asdasd'
|
| 74 |
+
"""
|
| 75 |
+
num_prev_digits = 0
|
| 76 |
+
new_string = ""
|
| 77 |
+
for i, c in enumerate(string):
|
| 78 |
+
# isdigit() doesnt work here because of weird unicode chars.
|
| 79 |
+
if c in {'1', '2', '3', '4', '5', '6', '7', '8', '9', '0'}:
|
| 80 |
+
num_prev_digits += 1
|
| 81 |
+
else:
|
| 82 |
+
if num_prev_digits > 3:
|
| 83 |
+
# Some fixing
|
| 84 |
+
string_number = new_string[-num_prev_digits:]
|
| 85 |
+
new_string = new_string[:-num_prev_digits] + "{0:,}".format(int(string_number))
|
| 86 |
+
num_prev_digits = 0
|
| 87 |
+
new_string += c
|
| 88 |
+
|
| 89 |
+
if num_prev_digits > 3:
|
| 90 |
+
# Some fixing
|
| 91 |
+
string_number = new_string[-num_prev_digits:]
|
| 92 |
+
new_string = new_string[:-num_prev_digits] + "{0:,}".format(int(string_number))
|
| 93 |
+
|
| 94 |
+
return new_string
|
| 95 |
+
|
| 96 |
+
def fix_fracs(string):
    """Normalize \\frac shorthand: '\\frac12' -> '\\frac{1}{2}', '\\frac1{72}' -> '\\frac{1}{72}'.

    Returns the input unchanged when a fragment is too short to rewrite.
    Fixes an IndexError in the original when the string ended with '\\frac'
    (the split produced an empty fragment and substr[0] was read).
    """
    substrs = string.split("\\frac")
    new_str = substrs[0]
    for substr in substrs[1:]:
        new_str += "\\frac"
        if not substr:
            # '\frac' at end of string: nothing follows, nothing to rewrite.
            continue
        if substr[0] == "{":
            # Already braced; pass through untouched.
            new_str += substr
            continue
        if len(substr) < 2:
            # Single bare character after \frac: can't form numerator+denominator.
            return string
        a, b = substr[0], substr[1]
        if b != "{":
            # \frac ab -> \frac{a}{b}
            new_str += "{" + a + "}{" + b + "}" + substr[2:]
        else:
            # \frac a{...} -> \frac{a}{...}
            new_str += "{" + a + "}" + b + substr[2:]
    return new_str
|
| 126 |
+
|
| 127 |
+
def fix_a_slash_b(string):
    """Rewrite a plain integer ratio 'a/b' as '\\frac{a}{b}'; otherwise return unchanged.

    Fixes an uncaught ValueError in the original: non-integer operands
    (e.g. 'a/b') raised from int() because only AssertionError was handled.
    Also replaces the control-flow assert (stripped under -O) with a test.
    """
    parts = string.split("/")
    if len(parts) != 2:
        return string
    try:
        a = int(parts[0])
        b = int(parts[1])
    except ValueError:
        return string
    # Only rewrite when the string is exactly the canonical 'a/b' form
    # (rejects signs with whitespace, leading zeros, etc.).
    if string != "{}/{}".format(a, b):
        return string
    return "\\frac{" + str(a) + "}{" + str(b) + "}"
|
| 140 |
+
|
| 141 |
+
def remove_right_units(string):
    """Drop a trailing '\\text{ ...}' unit annotation, returning the prefix.

    "\\text{ " only ever occurs (at least in the val set) when describing units.
    The original asserted exactly one occurrence and crashed otherwise; here
    we split on the first occurrence, which is robust to repeated units.
    """
    if "\\text{ " in string:
        return string.split("\\text{ ")[0]
    return string
|
| 149 |
+
|
| 150 |
+
def fix_sqrt(string):
    """Wrap bare \\sqrt arguments in braces: '\\sqrt3' -> '\\sqrt{3}'.

    Fixes an IndexError in the original when the string ended with '\\sqrt'
    (the empty split fragment was indexed).
    """
    if "\\sqrt" not in string:
        return string
    splits = string.split("\\sqrt")
    new_string = splits[0]
    for split in splits[1:]:
        if split and split[0] != "{":
            # Brace the single character following \sqrt.
            new_string += "\\sqrt{" + split[0] + "}" + split[1:]
        else:
            # Already braced, or '\sqrt' ended the string.
            new_string += "\\sqrt" + split
    return new_string
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def strip_string(string):
    """Normalize a LaTeX answer string for equality comparison.

    Applies a fixed, order-dependent sequence of rewrites (whitespace,
    \\frac variants, units, percentages, leading zeros, sqrt/frac/ratio
    canonicalization). The order of the steps matters; do not reorder.
    """
    # linebreaks
    string = string.replace("\n", "")

    # remove inverse spaces
    string = string.replace("\\!", "")

    # replace \\ with \
    string = string.replace("\\\\", "\\")

    # replace tfrac and dfrac with frac
    string = string.replace("tfrac", "frac")
    string = string.replace("dfrac", "frac")

    # remove \left and \right
    string = string.replace("\\left", "")
    string = string.replace("\\right", "")

    # Remove circ (degrees)
    string = string.replace("^{\\circ}", "")
    string = string.replace("^\\circ", "")

    # remove dollar signs
    string = string.replace("\\$", "")

    # remove units (on the right)
    string = remove_right_units(string)

    # remove percentage
    string = string.replace("\\%", "")
    string = string.replace("\%", "") # noqa: W605

    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
    string = string.replace(" .", " 0.")
    string = string.replace("{.", "{0.")
    # if empty, return empty string
    if len(string) == 0:
        return string
    if string[0] == ".":
        string = "0" + string

    # to consider: get rid of e.g. "k = " or "q = " at beginning
    if len(string.split("=")) == 2:
        if len(string.split("=")[0]) <= 2:
            string = string.split("=")[1]

    # fix sqrt3 --> sqrt{3}
    string = fix_sqrt(string)

    # remove spaces
    string = string.replace(" ", "")

    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
    string = fix_fracs(string)

    # manually change 0.5 --> \frac{1}{2}
    if string == "0.5":
        string = "\\frac{1}{2}"

    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
    string = fix_a_slash_b(string)

    return string
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def is_equiv(str1, str2, verbose=False):
    """Check whether two (possibly LaTeX) answer strings are equivalent.

    Normalizes both with strip_string, then compares with math_equal and
    plain string equality. If normalization raises, falls back to comparing
    the raw strings.
    """
    if str1 is None and str2 is None:
        print("WARNING: Both None")
        return True
    if str1 is None or str2 is None:
        return False

    try:
        ss1 = strip_string(str1)
        ss2 = strip_string(str2)
        if verbose:
            print(ss1, ss2)
        return math_equal(ss1, ss2) or ss1 == ss2
    except Exception:
        # Bug fix: the original compared str1 against *itself*
        # (math_equal(str1, str1)), making the fallback almost always True.
        return math_equal(str1, str2) or str1 == str2
|
| 250 |
+
|
| 251 |
+
class NotEqual:
    """Sentinel that compares unequal to everything, including itself."""

    def __eq__(self, other):
        return False
|