Commit
•
3999164
1
Parent(s):
69ea46d
Upload folder using huggingface_hub
Browse files- README.md +56 -0
- adapter_config.json +36 -0
- adapter_model.safetensors +3 -0
- added_tokens.json +5 -0
- all_results.json +7 -0
- merges.txt +0 -0
- special_tokens_map.json +20 -0
- tokenizer_config.json +44 -0
- train_results.json +7 -0
- trainer_log.jsonl +49 -0
- trainer_state.json +366 -0
- training_args.bin +3 -0
- training_loss.png +0 -0
- vocab.json +0 -0
README.md
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: other
|
3 |
+
library_name: peft
|
4 |
+
tags:
|
5 |
+
- llama-factory
|
6 |
+
- lora
|
7 |
+
- generated_from_trainer
|
8 |
+
base_model: gabrielmbmb/Upcycled-Qwen1.5-MoE2.7B
|
9 |
+
model-index:
|
10 |
+
- name: first
|
11 |
+
results: []
|
12 |
+
---
|
13 |
+
|
14 |
+
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
15 |
+
should probably proofread and complete it, then remove this comment. -->
|
16 |
+
|
17 |
+
# first
|
18 |
+
|
19 |
+
This model is a fine-tuned version of [gabrielmbmb/Upcycled-Qwen1.5-MoE2.7B](https://huggingface.co/gabrielmbmb/Upcycled-Qwen1.5-MoE2.7B) on the wiki_demo dataset.
|
20 |
+
|
21 |
+
## Model description
|
22 |
+
|
23 |
+
More information needed
|
24 |
+
|
25 |
+
## Intended uses & limitations
|
26 |
+
|
27 |
+
More information needed
|
28 |
+
|
29 |
+
## Training and evaluation data
|
30 |
+
|
31 |
+
More information needed
|
32 |
+
|
33 |
+
## Training procedure
|
34 |
+
|
35 |
+
### Training hyperparameters
|
36 |
+
|
37 |
+
The following hyperparameters were used during training:
|
38 |
+
- learning_rate: 5e-05
|
39 |
+
- train_batch_size: 8
|
40 |
+
- eval_batch_size: 8
|
41 |
+
- seed: 42
|
42 |
+
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
|
43 |
+
- lr_scheduler_type: cosine
|
44 |
+
- num_epochs: 3.0
|
45 |
+
|
46 |
+
### Training results
|
47 |
+
|
48 |
+
|
49 |
+
|
50 |
+
### Framework versions
|
51 |
+
|
52 |
+
- PEFT 0.10.0
|
53 |
+
- Transformers 4.40.0.dev0
|
54 |
+
- Pytorch 2.2.2+cu121
|
55 |
+
- Datasets 2.18.0
|
56 |
+
- Tokenizers 0.15.2
|
adapter_config.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "gabrielmbmb/Upcycled-Qwen1.5-MoE2.7B",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0.0,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 8,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"up_proj",
|
24 |
+
"gate_proj",
|
25 |
+
"v_proj",
|
26 |
+
"k_proj",
|
27 |
+
"gate",
|
28 |
+
"shared_expert_gate",
|
29 |
+
"down_proj",
|
30 |
+
"q_proj",
|
31 |
+
"o_proj"
|
32 |
+
],
|
33 |
+
"task_type": "CAUSAL_LM",
|
34 |
+
"use_dora": false,
|
35 |
+
"use_rslora": false
|
36 |
+
}
|
adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0d4b53525787211fd3e8d6586fa4ef8315bfddb31d188e356cabd1f033e98bb6
|
3 |
+
size 507798304
|
added_tokens.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"<|endoftext|>": 151643,
|
3 |
+
"<|im_end|>": 151645,
|
4 |
+
"<|im_start|>": 151644
|
5 |
+
}
|
all_results.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 3.0,
|
3 |
+
"train_loss": 4.515984590848287,
|
4 |
+
"train_runtime": 5513.8168,
|
5 |
+
"train_samples_per_second": 0.696,
|
6 |
+
"train_steps_per_second": 0.087
|
7 |
+
}
|
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
special_tokens_map.json
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<|im_start|>",
|
4 |
+
"<|im_end|>"
|
5 |
+
],
|
6 |
+
"eos_token": {
|
7 |
+
"content": "<|endoftext|>",
|
8 |
+
"lstrip": false,
|
9 |
+
"normalized": false,
|
10 |
+
"rstrip": false,
|
11 |
+
"single_word": false
|
12 |
+
},
|
13 |
+
"pad_token": {
|
14 |
+
"content": "<|endoftext|>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": false,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false
|
19 |
+
}
|
20 |
+
}
|
tokenizer_config.json
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_prefix_space": false,
|
3 |
+
"added_tokens_decoder": {
|
4 |
+
"151643": {
|
5 |
+
"content": "<|endoftext|>",
|
6 |
+
"lstrip": false,
|
7 |
+
"normalized": false,
|
8 |
+
"rstrip": false,
|
9 |
+
"single_word": false,
|
10 |
+
"special": true
|
11 |
+
},
|
12 |
+
"151644": {
|
13 |
+
"content": "<|im_start|>",
|
14 |
+
"lstrip": false,
|
15 |
+
"normalized": false,
|
16 |
+
"rstrip": false,
|
17 |
+
"single_word": false,
|
18 |
+
"special": true
|
19 |
+
},
|
20 |
+
"151645": {
|
21 |
+
"content": "<|im_end|>",
|
22 |
+
"lstrip": false,
|
23 |
+
"normalized": false,
|
24 |
+
"rstrip": false,
|
25 |
+
"single_word": false,
|
26 |
+
"special": true
|
27 |
+
}
|
28 |
+
},
|
29 |
+
"additional_special_tokens": [
|
30 |
+
"<|im_start|>",
|
31 |
+
"<|im_end|>"
|
32 |
+
],
|
33 |
+
"bos_token": null,
|
34 |
+
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}",
|
35 |
+
"clean_up_tokenization_spaces": false,
|
36 |
+
"eos_token": "<|endoftext|>",
|
37 |
+
"errors": "replace",
|
38 |
+
"model_max_length": 32768,
|
39 |
+
"pad_token": "<|endoftext|>",
|
40 |
+
"padding_side": "right",
|
41 |
+
"split_special_tokens": false,
|
42 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
43 |
+
"unk_token": null
|
44 |
+
}
|
train_results.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 3.0,
|
3 |
+
"train_loss": 4.515984590848287,
|
4 |
+
"train_runtime": 5513.8168,
|
5 |
+
"train_samples_per_second": 0.696,
|
6 |
+
"train_steps_per_second": 0.087
|
7 |
+
}
|
trainer_log.jsonl
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"current_steps": 10, "total_steps": 480, "loss": 8.9503, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.994647308096509e-05, "epoch": 0.06, "percentage": 2.08, "elapsed_time": "0:01:45", "remaining_time": "1:22:28"}
|
2 |
+
{"current_steps": 20, "total_steps": 480, "loss": 8.0851, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9786121534345265e-05, "epoch": 0.12, "percentage": 4.17, "elapsed_time": "0:03:35", "remaining_time": "1:22:28"}
|
3 |
+
{"current_steps": 30, "total_steps": 480, "loss": 7.5574, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.951963201008076e-05, "epoch": 0.19, "percentage": 6.25, "elapsed_time": "0:05:25", "remaining_time": "1:21:16"}
|
4 |
+
{"current_steps": 40, "total_steps": 480, "loss": 6.9461, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.914814565722671e-05, "epoch": 0.25, "percentage": 8.33, "elapsed_time": "0:07:16", "remaining_time": "1:20:00"}
|
5 |
+
{"current_steps": 50, "total_steps": 480, "loss": 6.1526, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.867325323737765e-05, "epoch": 0.31, "percentage": 10.42, "elapsed_time": "0:09:07", "remaining_time": "1:18:27"}
|
6 |
+
{"current_steps": 60, "total_steps": 480, "loss": 5.4116, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8096988312782174e-05, "epoch": 0.38, "percentage": 12.5, "elapsed_time": "0:10:55", "remaining_time": "1:16:25"}
|
7 |
+
{"current_steps": 70, "total_steps": 480, "loss": 5.0721, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.742181853831721e-05, "epoch": 0.44, "percentage": 14.58, "elapsed_time": "0:12:44", "remaining_time": "1:14:36"}
|
8 |
+
{"current_steps": 80, "total_steps": 480, "loss": 4.7695, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.665063509461097e-05, "epoch": 0.5, "percentage": 16.67, "elapsed_time": "0:14:35", "remaining_time": "1:12:58"}
|
9 |
+
{"current_steps": 90, "total_steps": 480, "loss": 4.6179, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5786740307563636e-05, "epoch": 0.56, "percentage": 18.75, "elapsed_time": "0:16:28", "remaining_time": "1:11:21"}
|
10 |
+
{"current_steps": 100, "total_steps": 480, "loss": 4.4848, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4833833507280884e-05, "epoch": 0.62, "percentage": 20.83, "elapsed_time": "0:18:21", "remaining_time": "1:09:43"}
|
11 |
+
{"current_steps": 110, "total_steps": 480, "loss": 4.3627, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.379599518697444e-05, "epoch": 0.69, "percentage": 22.92, "elapsed_time": "0:20:14", "remaining_time": "1:08:05"}
|
12 |
+
{"current_steps": 120, "total_steps": 480, "loss": 4.3319, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.267766952966369e-05, "epoch": 0.75, "percentage": 25.0, "elapsed_time": "0:22:08", "remaining_time": "1:06:26"}
|
13 |
+
{"current_steps": 130, "total_steps": 480, "loss": 4.269, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.148364537750172e-05, "epoch": 0.81, "percentage": 27.08, "elapsed_time": "0:24:02", "remaining_time": "1:04:44"}
|
14 |
+
{"current_steps": 140, "total_steps": 480, "loss": 4.2557, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.021903572521802e-05, "epoch": 0.88, "percentage": 29.17, "elapsed_time": "0:25:56", "remaining_time": "1:03:00"}
|
15 |
+
{"current_steps": 150, "total_steps": 480, "loss": 4.1534, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.888925582549006e-05, "epoch": 0.94, "percentage": 31.25, "elapsed_time": "0:27:51", "remaining_time": "1:01:17"}
|
16 |
+
{"current_steps": 160, "total_steps": 480, "loss": 4.2017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7500000000000003e-05, "epoch": 1.0, "percentage": 33.33, "elapsed_time": "0:29:45", "remaining_time": "0:59:30"}
|
17 |
+
{"current_steps": 170, "total_steps": 480, "loss": 4.1732, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6057217255475034e-05, "epoch": 1.06, "percentage": 35.42, "elapsed_time": "0:31:39", "remaining_time": "0:57:44"}
|
18 |
+
{"current_steps": 180, "total_steps": 480, "loss": 4.1358, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.456708580912725e-05, "epoch": 1.12, "percentage": 37.5, "elapsed_time": "0:33:33", "remaining_time": "0:55:56"}
|
19 |
+
{"current_steps": 190, "total_steps": 480, "loss": 4.1514, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.303598663257904e-05, "epoch": 1.19, "percentage": 39.58, "elapsed_time": "0:35:29", "remaining_time": "0:54:10"}
|
20 |
+
{"current_steps": 200, "total_steps": 480, "loss": 4.0925, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.147047612756302e-05, "epoch": 1.25, "percentage": 41.67, "elapsed_time": "0:37:25", "remaining_time": "0:52:23"}
|
21 |
+
{"current_steps": 210, "total_steps": 480, "loss": 4.0496, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9877258050403212e-05, "epoch": 1.31, "percentage": 43.75, "elapsed_time": "0:39:20", "remaining_time": "0:50:35"}
|
22 |
+
{"current_steps": 220, "total_steps": 480, "loss": 4.0853, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8263154805501297e-05, "epoch": 1.38, "percentage": 45.83, "elapsed_time": "0:41:16", "remaining_time": "0:48:46"}
|
23 |
+
{"current_steps": 230, "total_steps": 480, "loss": 4.1283, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.663507823075358e-05, "epoch": 1.44, "percentage": 47.92, "elapsed_time": "0:43:11", "remaining_time": "0:46:56"}
|
24 |
+
{"current_steps": 240, "total_steps": 480, "loss": 4.0567, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.5e-05, "epoch": 1.5, "percentage": 50.0, "elapsed_time": "0:45:07", "remaining_time": "0:45:07"}
|
25 |
+
{"current_steps": 250, "total_steps": 480, "loss": 4.0141, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.3364921769246423e-05, "epoch": 1.56, "percentage": 52.08, "elapsed_time": "0:47:03", "remaining_time": "0:43:17"}
|
26 |
+
{"current_steps": 260, "total_steps": 480, "loss": 4.0356, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.173684519449872e-05, "epoch": 1.62, "percentage": 54.17, "elapsed_time": "0:48:59", "remaining_time": "0:41:27"}
|
27 |
+
{"current_steps": 270, "total_steps": 480, "loss": 4.082, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0122741949596797e-05, "epoch": 1.69, "percentage": 56.25, "elapsed_time": "0:50:56", "remaining_time": "0:39:36"}
|
28 |
+
{"current_steps": 280, "total_steps": 480, "loss": 4.0767, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.852952387243698e-05, "epoch": 1.75, "percentage": 58.33, "elapsed_time": "0:52:52", "remaining_time": "0:37:46"}
|
29 |
+
{"current_steps": 290, "total_steps": 480, "loss": 3.9817, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6964013367420966e-05, "epoch": 1.81, "percentage": 60.42, "elapsed_time": "0:54:49", "remaining_time": "0:35:55"}
|
30 |
+
{"current_steps": 300, "total_steps": 480, "loss": 4.0618, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5432914190872757e-05, "epoch": 1.88, "percentage": 62.5, "elapsed_time": "0:56:46", "remaining_time": "0:34:03"}
|
31 |
+
{"current_steps": 310, "total_steps": 480, "loss": 4.0106, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3942782744524973e-05, "epoch": 1.94, "percentage": 64.58, "elapsed_time": "0:58:43", "remaining_time": "0:32:12"}
|
32 |
+
{"current_steps": 320, "total_steps": 480, "loss": 4.0549, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2500000000000006e-05, "epoch": 2.0, "percentage": 66.67, "elapsed_time": "1:00:39", "remaining_time": "0:30:19"}
|
33 |
+
{"current_steps": 330, "total_steps": 480, "loss": 3.919, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1110744174509952e-05, "epoch": 2.06, "percentage": 68.75, "elapsed_time": "1:02:36", "remaining_time": "0:28:27"}
|
34 |
+
{"current_steps": 340, "total_steps": 480, "loss": 4.053, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.780964274781984e-06, "epoch": 2.12, "percentage": 70.83, "elapsed_time": "1:04:33", "remaining_time": "0:26:34"}
|
35 |
+
{"current_steps": 350, "total_steps": 480, "loss": 4.0032, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.51635462249828e-06, "epoch": 2.19, "percentage": 72.92, "elapsed_time": "1:06:30", "remaining_time": "0:24:42"}
|
36 |
+
{"current_steps": 360, "total_steps": 480, "loss": 3.991, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.3223304703363135e-06, "epoch": 2.25, "percentage": 75.0, "elapsed_time": "1:08:27", "remaining_time": "0:22:49"}
|
37 |
+
{"current_steps": 370, "total_steps": 480, "loss": 3.9941, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.204004813025568e-06, "epoch": 2.31, "percentage": 77.08, "elapsed_time": "1:10:25", "remaining_time": "0:20:56"}
|
38 |
+
{"current_steps": 380, "total_steps": 480, "loss": 4.0388, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.166166492719124e-06, "epoch": 2.38, "percentage": 79.17, "elapsed_time": "1:12:22", "remaining_time": "0:19:02"}
|
39 |
+
{"current_steps": 390, "total_steps": 480, "loss": 4.0767, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.213259692436367e-06, "epoch": 2.44, "percentage": 81.25, "elapsed_time": "1:14:20", "remaining_time": "0:17:09"}
|
40 |
+
{"current_steps": 400, "total_steps": 480, "loss": 3.9709, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3493649053890326e-06, "epoch": 2.5, "percentage": 83.33, "elapsed_time": "1:16:16", "remaining_time": "0:15:15"}
|
41 |
+
{"current_steps": 410, "total_steps": 480, "loss": 3.9714, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.578181461682794e-06, "epoch": 2.56, "percentage": 85.42, "elapsed_time": "1:18:12", "remaining_time": "0:13:21"}
|
42 |
+
{"current_steps": 420, "total_steps": 480, "loss": 4.0366, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9030116872178316e-06, "epoch": 2.62, "percentage": 87.5, "elapsed_time": "1:20:10", "remaining_time": "0:11:27"}
|
43 |
+
{"current_steps": 430, "total_steps": 480, "loss": 3.9307, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3267467626223606e-06, "epoch": 2.69, "percentage": 89.58, "elapsed_time": "1:22:07", "remaining_time": "0:09:33"}
|
44 |
+
{"current_steps": 440, "total_steps": 480, "loss": 4.0751, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.51854342773295e-07, "epoch": 2.75, "percentage": 91.67, "elapsed_time": "1:24:05", "remaining_time": "0:07:38"}
|
45 |
+
{"current_steps": 450, "total_steps": 480, "loss": 4.0141, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.803679899192392e-07, "epoch": 2.81, "percentage": 93.75, "elapsed_time": "1:26:02", "remaining_time": "0:05:44"}
|
46 |
+
{"current_steps": 460, "total_steps": 480, "loss": 3.9676, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1387846565474045e-07, "epoch": 2.88, "percentage": 95.83, "elapsed_time": "1:27:59", "remaining_time": "0:03:49"}
|
47 |
+
{"current_steps": 470, "total_steps": 480, "loss": 4.0041, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.352691903491303e-08, "epoch": 2.94, "percentage": 97.92, "elapsed_time": "1:29:56", "remaining_time": "0:01:54"}
|
48 |
+
{"current_steps": 480, "total_steps": 480, "loss": 3.909, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "1:31:52", "remaining_time": "0:00:00"}
|
49 |
+
{"current_steps": 480, "total_steps": 480, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "1:31:52", "remaining_time": "0:00:00"}
|
trainer_state.json
ADDED
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 3.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 480,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.06,
|
13 |
+
"grad_norm": 4.709561347961426,
|
14 |
+
"learning_rate": 4.994647308096509e-05,
|
15 |
+
"loss": 8.9503,
|
16 |
+
"step": 10
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.12,
|
20 |
+
"grad_norm": 2.5953965187072754,
|
21 |
+
"learning_rate": 4.9786121534345265e-05,
|
22 |
+
"loss": 8.0851,
|
23 |
+
"step": 20
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.19,
|
27 |
+
"grad_norm": 2.367886543273926,
|
28 |
+
"learning_rate": 4.951963201008076e-05,
|
29 |
+
"loss": 7.5574,
|
30 |
+
"step": 30
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.25,
|
34 |
+
"grad_norm": 3.426809549331665,
|
35 |
+
"learning_rate": 4.914814565722671e-05,
|
36 |
+
"loss": 6.9461,
|
37 |
+
"step": 40
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.31,
|
41 |
+
"grad_norm": 4.523465633392334,
|
42 |
+
"learning_rate": 4.867325323737765e-05,
|
43 |
+
"loss": 6.1526,
|
44 |
+
"step": 50
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.38,
|
48 |
+
"grad_norm": 1.7781277894973755,
|
49 |
+
"learning_rate": 4.8096988312782174e-05,
|
50 |
+
"loss": 5.4116,
|
51 |
+
"step": 60
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.44,
|
55 |
+
"grad_norm": 1.3401896953582764,
|
56 |
+
"learning_rate": 4.742181853831721e-05,
|
57 |
+
"loss": 5.0721,
|
58 |
+
"step": 70
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.5,
|
62 |
+
"grad_norm": 0.9671522378921509,
|
63 |
+
"learning_rate": 4.665063509461097e-05,
|
64 |
+
"loss": 4.7695,
|
65 |
+
"step": 80
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.56,
|
69 |
+
"grad_norm": 1.055595874786377,
|
70 |
+
"learning_rate": 4.5786740307563636e-05,
|
71 |
+
"loss": 4.6179,
|
72 |
+
"step": 90
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.62,
|
76 |
+
"grad_norm": 0.7612900733947754,
|
77 |
+
"learning_rate": 4.4833833507280884e-05,
|
78 |
+
"loss": 4.4848,
|
79 |
+
"step": 100
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.69,
|
83 |
+
"grad_norm": 0.7046266198158264,
|
84 |
+
"learning_rate": 4.379599518697444e-05,
|
85 |
+
"loss": 4.3627,
|
86 |
+
"step": 110
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.75,
|
90 |
+
"grad_norm": 0.9548586010932922,
|
91 |
+
"learning_rate": 4.267766952966369e-05,
|
92 |
+
"loss": 4.3319,
|
93 |
+
"step": 120
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.81,
|
97 |
+
"grad_norm": 0.6869709491729736,
|
98 |
+
"learning_rate": 4.148364537750172e-05,
|
99 |
+
"loss": 4.269,
|
100 |
+
"step": 130
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.88,
|
104 |
+
"grad_norm": 0.6282512545585632,
|
105 |
+
"learning_rate": 4.021903572521802e-05,
|
106 |
+
"loss": 4.2557,
|
107 |
+
"step": 140
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.94,
|
111 |
+
"grad_norm": 0.5287710428237915,
|
112 |
+
"learning_rate": 3.888925582549006e-05,
|
113 |
+
"loss": 4.1534,
|
114 |
+
"step": 150
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 1.0,
|
118 |
+
"grad_norm": 0.7542024850845337,
|
119 |
+
"learning_rate": 3.7500000000000003e-05,
|
120 |
+
"loss": 4.2017,
|
121 |
+
"step": 160
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 1.06,
|
125 |
+
"grad_norm": 0.49876415729522705,
|
126 |
+
"learning_rate": 3.6057217255475034e-05,
|
127 |
+
"loss": 4.1732,
|
128 |
+
"step": 170
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 1.12,
|
132 |
+
"grad_norm": 0.5626935958862305,
|
133 |
+
"learning_rate": 3.456708580912725e-05,
|
134 |
+
"loss": 4.1358,
|
135 |
+
"step": 180
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 1.19,
|
139 |
+
"grad_norm": 0.493310809135437,
|
140 |
+
"learning_rate": 3.303598663257904e-05,
|
141 |
+
"loss": 4.1514,
|
142 |
+
"step": 190
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 1.25,
|
146 |
+
"grad_norm": 0.4654150605201721,
|
147 |
+
"learning_rate": 3.147047612756302e-05,
|
148 |
+
"loss": 4.0925,
|
149 |
+
"step": 200
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 1.31,
|
153 |
+
"grad_norm": 0.592623770236969,
|
154 |
+
"learning_rate": 2.9877258050403212e-05,
|
155 |
+
"loss": 4.0496,
|
156 |
+
"step": 210
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 1.38,
|
160 |
+
"grad_norm": 0.5564578771591187,
|
161 |
+
"learning_rate": 2.8263154805501297e-05,
|
162 |
+
"loss": 4.0853,
|
163 |
+
"step": 220
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 1.44,
|
167 |
+
"grad_norm": 0.6952773332595825,
|
168 |
+
"learning_rate": 2.663507823075358e-05,
|
169 |
+
"loss": 4.1283,
|
170 |
+
"step": 230
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 1.5,
|
174 |
+
"grad_norm": 0.5385617613792419,
|
175 |
+
"learning_rate": 2.5e-05,
|
176 |
+
"loss": 4.0567,
|
177 |
+
"step": 240
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 1.56,
|
181 |
+
"grad_norm": 0.5663427114486694,
|
182 |
+
"learning_rate": 2.3364921769246423e-05,
|
183 |
+
"loss": 4.0141,
|
184 |
+
"step": 250
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 1.62,
|
188 |
+
"grad_norm": 0.5520788431167603,
|
189 |
+
"learning_rate": 2.173684519449872e-05,
|
190 |
+
"loss": 4.0356,
|
191 |
+
"step": 260
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 1.69,
|
195 |
+
"grad_norm": 0.5162128806114197,
|
196 |
+
"learning_rate": 2.0122741949596797e-05,
|
197 |
+
"loss": 4.082,
|
198 |
+
"step": 270
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 1.75,
|
202 |
+
"grad_norm": 0.5291630625724792,
|
203 |
+
"learning_rate": 1.852952387243698e-05,
|
204 |
+
"loss": 4.0767,
|
205 |
+
"step": 280
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 1.81,
|
209 |
+
"grad_norm": 0.6226648092269897,
|
210 |
+
"learning_rate": 1.6964013367420966e-05,
|
211 |
+
"loss": 3.9817,
|
212 |
+
"step": 290
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 1.88,
|
216 |
+
"grad_norm": 0.5460664629936218,
|
217 |
+
"learning_rate": 1.5432914190872757e-05,
|
218 |
+
"loss": 4.0618,
|
219 |
+
"step": 300
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 1.94,
|
223 |
+
"grad_norm": 0.7545162439346313,
|
224 |
+
"learning_rate": 1.3942782744524973e-05,
|
225 |
+
"loss": 4.0106,
|
226 |
+
"step": 310
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"epoch": 2.0,
|
230 |
+
"grad_norm": 0.6207989454269409,
|
231 |
+
"learning_rate": 1.2500000000000006e-05,
|
232 |
+
"loss": 4.0549,
|
233 |
+
"step": 320
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"epoch": 2.06,
|
237 |
+
"grad_norm": 0.5338532328605652,
|
238 |
+
"learning_rate": 1.1110744174509952e-05,
|
239 |
+
"loss": 3.919,
|
240 |
+
"step": 330
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"epoch": 2.12,
|
244 |
+
"grad_norm": 0.5484297275543213,
|
245 |
+
"learning_rate": 9.780964274781984e-06,
|
246 |
+
"loss": 4.053,
|
247 |
+
"step": 340
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"epoch": 2.19,
|
251 |
+
"grad_norm": 0.6356564164161682,
|
252 |
+
"learning_rate": 8.51635462249828e-06,
|
253 |
+
"loss": 4.0032,
|
254 |
+
"step": 350
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"epoch": 2.25,
|
258 |
+
"grad_norm": 0.5518457889556885,
|
259 |
+
"learning_rate": 7.3223304703363135e-06,
|
260 |
+
"loss": 3.991,
|
261 |
+
"step": 360
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"epoch": 2.31,
|
265 |
+
"grad_norm": 0.5176472067832947,
|
266 |
+
"learning_rate": 6.204004813025568e-06,
|
267 |
+
"loss": 3.9941,
|
268 |
+
"step": 370
|
269 |
+
},
|
270 |
+
{
|
271 |
+
"epoch": 2.38,
|
272 |
+
"grad_norm": 0.5543831586837769,
|
273 |
+
"learning_rate": 5.166166492719124e-06,
|
274 |
+
"loss": 4.0388,
|
275 |
+
"step": 380
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"epoch": 2.44,
|
279 |
+
"grad_norm": 0.5504453182220459,
|
280 |
+
"learning_rate": 4.213259692436367e-06,
|
281 |
+
"loss": 4.0767,
|
282 |
+
"step": 390
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"epoch": 2.5,
|
286 |
+
"grad_norm": 0.5619158148765564,
|
287 |
+
"learning_rate": 3.3493649053890326e-06,
|
288 |
+
"loss": 3.9709,
|
289 |
+
"step": 400
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"epoch": 2.56,
|
293 |
+
"grad_norm": 0.5513697266578674,
|
294 |
+
"learning_rate": 2.578181461682794e-06,
|
295 |
+
"loss": 3.9714,
|
296 |
+
"step": 410
|
297 |
+
},
|
298 |
+
{
|
299 |
+
"epoch": 2.62,
|
300 |
+
"grad_norm": 0.590857744216919,
|
301 |
+
"learning_rate": 1.9030116872178316e-06,
|
302 |
+
"loss": 4.0366,
|
303 |
+
"step": 420
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"epoch": 2.69,
|
307 |
+
"grad_norm": 0.5728959441184998,
|
308 |
+
"learning_rate": 1.3267467626223606e-06,
|
309 |
+
"loss": 3.9307,
|
310 |
+
"step": 430
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"epoch": 2.75,
|
314 |
+
"grad_norm": 0.5163474082946777,
|
315 |
+
"learning_rate": 8.51854342773295e-07,
|
316 |
+
"loss": 4.0751,
|
317 |
+
"step": 440
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"epoch": 2.81,
|
321 |
+
"grad_norm": 0.5247732996940613,
|
322 |
+
"learning_rate": 4.803679899192392e-07,
|
323 |
+
"loss": 4.0141,
|
324 |
+
"step": 450
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"epoch": 2.88,
|
328 |
+
"grad_norm": 0.5492649674415588,
|
329 |
+
"learning_rate": 2.1387846565474045e-07,
|
330 |
+
"loss": 3.9676,
|
331 |
+
"step": 460
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"epoch": 2.94,
|
335 |
+
"grad_norm": 0.5566267371177673,
|
336 |
+
"learning_rate": 5.352691903491303e-08,
|
337 |
+
"loss": 4.0041,
|
338 |
+
"step": 470
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"epoch": 3.0,
|
342 |
+
"grad_norm": 0.6199953556060791,
|
343 |
+
"learning_rate": 0.0,
|
344 |
+
"loss": 3.909,
|
345 |
+
"step": 480
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"epoch": 3.0,
|
349 |
+
"step": 480,
|
350 |
+
"total_flos": 3.260181978788659e+17,
|
351 |
+
"train_loss": 4.515984590848287,
|
352 |
+
"train_runtime": 5513.8168,
|
353 |
+
"train_samples_per_second": 0.696,
|
354 |
+
"train_steps_per_second": 0.087
|
355 |
+
}
|
356 |
+
],
|
357 |
+
"logging_steps": 10,
|
358 |
+
"max_steps": 480,
|
359 |
+
"num_input_tokens_seen": 0,
|
360 |
+
"num_train_epochs": 3,
|
361 |
+
"save_steps": 1000,
|
362 |
+
"total_flos": 3.260181978788659e+17,
|
363 |
+
"train_batch_size": 8,
|
364 |
+
"trial_name": null,
|
365 |
+
"trial_params": null
|
366 |
+
}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8ee90bb77243c24436e2e183b8a39237b082d3ff29adac639257c4ee220a498b
|
3 |
+
size 5112
|
training_loss.png
ADDED
![]() |
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|