ParagonLight committed on
Commit
0a8e533
1 Parent(s): c6dd428

upload loras

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. llama2_13b_peft/abstract_narrative_understanding/README.md +78 -0
  2. llama2_13b_peft/abstract_narrative_understanding/adapter_config.json +33 -0
  3. llama2_13b_peft/abstract_narrative_understanding/adapter_model.safetensors +3 -0
  4. llama2_13b_peft/abstract_narrative_understanding/all_results.json +11 -0
  5. llama2_13b_peft/abstract_narrative_understanding/eval_results.json +7 -0
  6. llama2_13b_peft/abstract_narrative_understanding/special_tokens_map.json +24 -0
  7. llama2_13b_peft/abstract_narrative_understanding/tokenizer.model +3 -0
  8. llama2_13b_peft/abstract_narrative_understanding/tokenizer_config.json +45 -0
  9. llama2_13b_peft/abstract_narrative_understanding/train_results.json +7 -0
  10. llama2_13b_peft/abstract_narrative_understanding/trainer_log.jsonl +80 -0
  11. llama2_13b_peft/abstract_narrative_understanding/trainer_state.json +589 -0
  12. llama2_13b_peft/abstract_narrative_understanding/training_args.bin +3 -0
  13. llama2_13b_peft/abstract_narrative_understanding/training_eval_loss.png +0 -0
  14. llama2_13b_peft/abstract_narrative_understanding/training_loss.png +0 -0
  15. llama2_13b_peft/alpaca/README.md +81 -0
  16. llama2_13b_peft/alpaca/adapter_config.json +33 -0
  17. llama2_13b_peft/alpaca/adapter_model.safetensors +3 -0
  18. llama2_13b_peft/alpaca/all_results.json +11 -0
  19. llama2_13b_peft/alpaca/eval_results.json +7 -0
  20. llama2_13b_peft/alpaca/special_tokens_map.json +24 -0
  21. llama2_13b_peft/alpaca/tokenizer.model +3 -0
  22. llama2_13b_peft/alpaca/tokenizer_config.json +45 -0
  23. llama2_13b_peft/alpaca/train_results.json +7 -0
  24. llama2_13b_peft/alpaca/trainer_log.jsonl +338 -0
  25. llama2_13b_peft/alpaca/trainer_state.json +2398 -0
  26. llama2_13b_peft/alpaca/training_args.bin +3 -0
  27. llama2_13b_peft/alpaca/training_eval_loss.png +0 -0
  28. llama2_13b_peft/alpaca/training_loss.png +0 -0
  29. llama2_13b_peft/cnn_dailymail/README.md +82 -0
  30. llama2_13b_peft/cnn_dailymail/adapter_config.json +33 -0
  31. llama2_13b_peft/cnn_dailymail/adapter_model.safetensors +3 -0
  32. llama2_13b_peft/cnn_dailymail/all_results.json +11 -0
  33. llama2_13b_peft/cnn_dailymail/eval_results.json +7 -0
  34. llama2_13b_peft/cnn_dailymail/special_tokens_map.json +24 -0
  35. llama2_13b_peft/cnn_dailymail/tokenizer.model +3 -0
  36. llama2_13b_peft/cnn_dailymail/tokenizer_config.json +45 -0
  37. llama2_13b_peft/cnn_dailymail/train_results.json +7 -0
  38. llama2_13b_peft/cnn_dailymail/trainer_log.jsonl +104 -0
  39. llama2_13b_peft/cnn_dailymail/trainer_state.json +761 -0
  40. llama2_13b_peft/cnn_dailymail/training_args.bin +3 -0
  41. llama2_13b_peft/cnn_dailymail/training_eval_loss.png +0 -0
  42. llama2_13b_peft/cnn_dailymail/training_loss.png +0 -0
  43. llama2_13b_peft/contextual_parametric_knowledge_conflicts/README.md +78 -0
  44. llama2_13b_peft/contextual_parametric_knowledge_conflicts/adapter_config.json +33 -0
  45. llama2_13b_peft/contextual_parametric_knowledge_conflicts/adapter_model.safetensors +3 -0
  46. llama2_13b_peft/contextual_parametric_knowledge_conflicts/all_results.json +11 -0
  47. llama2_13b_peft/contextual_parametric_knowledge_conflicts/eval_results.json +7 -0
  48. llama2_13b_peft/contextual_parametric_knowledge_conflicts/special_tokens_map.json +24 -0
  49. llama2_13b_peft/contextual_parametric_knowledge_conflicts/tokenizer.model +3 -0
  50. llama2_13b_peft/contextual_parametric_knowledge_conflicts/tokenizer_config.json +45 -0
llama2_13b_peft/abstract_narrative_understanding/README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: peft
4
+ tags:
5
+ - llama-factory
6
+ - lora
7
+ - generated_from_trainer
8
+ base_model: /data1/model/llama2/meta-llama/Llama2-13b
9
+ model-index:
10
+ - name: abstract_narrative_understanding_no_sys
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # abstract_narrative_understanding_no_sys
18
+
19
+ This model is a fine-tuned version of `/data1/model/llama2/meta-llama/Llama2-13b` on the abstract_narrative_understanding_no_sys dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.0775
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 0.0001
41
+ - train_batch_size: 8
42
+ - eval_batch_size: 8
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - num_devices: 3
46
+ - total_train_batch_size: 24
47
+ - total_eval_batch_size: 24
48
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
+ - lr_scheduler_type: cosine
50
+ - lr_scheduler_warmup_steps: 20
51
+ - num_epochs: 10.0
52
+
53
+ ### Training results
54
+
55
+ | Training Loss | Epoch | Step | Validation Loss |
56
+ |:-------------:|:-----:|:----:|:---------------:|
57
+ | 0.1783 | 0.59 | 50 | 0.1586 |
58
+ | 0.076 | 1.18 | 100 | 0.1166 |
59
+ | 0.0666 | 1.76 | 150 | 0.0825 |
60
+ | 0.0173 | 2.35 | 200 | 0.0773 |
61
+ | 0.0269 | 2.94 | 250 | 0.0836 |
62
+ | 0.0066 | 3.53 | 300 | 0.0656 |
63
+ | 0.003 | 4.12 | 350 | 0.0645 |
64
+ | 0.0027 | 4.71 | 400 | 0.0729 |
65
+ | 0.0011 | 5.29 | 450 | 0.0678 |
66
+ | 0.0009 | 5.88 | 500 | 0.0775 |
67
+ | 0.0017 | 6.47 | 550 | 0.0782 |
68
+ | 0.0001 | 7.06 | 600 | 0.0793 |
69
+ | 0.0001 | 7.65 | 650 | 0.0812 |
70
+
71
+
72
+ ### Framework versions
73
+
74
+ - PEFT 0.9.0
75
+ - Transformers 4.38.2
76
+ - Pytorch 2.2.1
77
+ - Datasets 2.18.0
78
+ - Tokenizers 0.15.2
llama2_13b_peft/abstract_narrative_understanding/adapter_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/data1/model/llama2/meta-llama/Llama2-13b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.0,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 8,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "k_proj",
23
+ "v_proj",
24
+ "down_proj",
25
+ "up_proj",
26
+ "o_proj",
27
+ "gate_proj",
28
+ "q_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "use_dora": false,
32
+ "use_rslora": false
33
+ }
llama2_13b_peft/abstract_narrative_understanding/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c278b153aa2effab442633e3e881c97d93a4b8bfa4b91a9121708bc0eb844274
3
+ size 125248064
llama2_13b_peft/abstract_narrative_understanding/all_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 7.65,
3
+ "eval_loss": 0.0775449350476265,
4
+ "eval_runtime": 18.6993,
5
+ "eval_samples_per_second": 19.252,
6
+ "eval_steps_per_second": 0.802,
7
+ "train_loss": 0.0863784014862568,
8
+ "train_runtime": 2843.6931,
9
+ "train_samples_per_second": 7.174,
10
+ "train_steps_per_second": 0.299
11
+ }
llama2_13b_peft/abstract_narrative_understanding/eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 7.65,
3
+ "eval_loss": 0.0775449350476265,
4
+ "eval_runtime": 18.6993,
5
+ "eval_samples_per_second": 19.252,
6
+ "eval_steps_per_second": 0.802
7
+ }
llama2_13b_peft/abstract_narrative_understanding/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
llama2_13b_peft/abstract_narrative_understanding/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
llama2_13b_peft/abstract_narrative_understanding/tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "LlamaTokenizer",
43
+ "unk_token": "<unk>",
44
+ "use_default_system_prompt": false
45
+ }
llama2_13b_peft/abstract_narrative_understanding/train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 7.65,
3
+ "train_loss": 0.0863784014862568,
4
+ "train_runtime": 2843.6931,
5
+ "train_samples_per_second": 7.174,
6
+ "train_steps_per_second": 0.299
7
+ }
llama2_13b_peft/abstract_narrative_understanding/trainer_log.jsonl ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 850, "loss": 2.3117, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5e-05, "epoch": 0.12, "percentage": 1.18, "elapsed_time": "0:00:40", "remaining_time": "0:57:16"}
2
+ {"current_steps": 20, "total_steps": 850, "loss": 1.1002, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001, "epoch": 0.24, "percentage": 2.35, "elapsed_time": "0:01:20", "remaining_time": "0:55:30"}
3
+ {"current_steps": 30, "total_steps": 850, "loss": 0.291, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.996418774081658e-05, "epoch": 0.35, "percentage": 3.53, "elapsed_time": "0:02:00", "remaining_time": "0:54:43"}
4
+ {"current_steps": 40, "total_steps": 850, "loss": 0.2298, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.985680226398261e-05, "epoch": 0.47, "percentage": 4.71, "elapsed_time": "0:02:39", "remaining_time": "0:53:53"}
5
+ {"current_steps": 50, "total_steps": 850, "loss": 0.1783, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.967799739815925e-05, "epoch": 0.59, "percentage": 5.88, "elapsed_time": "0:03:19", "remaining_time": "0:53:14"}
6
+ {"current_steps": 50, "total_steps": 850, "loss": null, "eval_loss": 0.15855231881141663, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.59, "percentage": 5.88, "elapsed_time": "0:03:19", "remaining_time": "0:53:14"}
7
+ {"current_steps": 60, "total_steps": 850, "loss": 0.1504, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.942802927959443e-05, "epoch": 0.71, "percentage": 7.06, "elapsed_time": "0:04:18", "remaining_time": "0:56:43"}
8
+ {"current_steps": 70, "total_steps": 850, "loss": 0.1438, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.910725598521013e-05, "epoch": 0.82, "percentage": 8.24, "elapsed_time": "0:04:58", "remaining_time": "0:55:26"}
9
+ {"current_steps": 80, "total_steps": 850, "loss": 0.126, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.871613701966067e-05, "epoch": 0.94, "percentage": 9.41, "elapsed_time": "0:05:38", "remaining_time": "0:54:17"}
10
+ {"current_steps": 90, "total_steps": 850, "loss": 0.0922, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.825523265709666e-05, "epoch": 1.06, "percentage": 10.59, "elapsed_time": "0:06:19", "remaining_time": "0:53:23"}
11
+ {"current_steps": 100, "total_steps": 850, "loss": 0.076, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.772520313857775e-05, "epoch": 1.18, "percentage": 11.76, "elapsed_time": "0:06:58", "remaining_time": "0:52:21"}
12
+ {"current_steps": 100, "total_steps": 850, "loss": null, "eval_loss": 0.11661680787801743, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.18, "percentage": 11.76, "elapsed_time": "0:06:58", "remaining_time": "0:52:21"}
13
+ {"current_steps": 110, "total_steps": 850, "loss": 0.0649, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.712680772628364e-05, "epoch": 1.29, "percentage": 12.94, "elapsed_time": "0:07:57", "remaining_time": "0:53:31"}
14
+ {"current_steps": 120, "total_steps": 850, "loss": 0.099, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.646090361587827e-05, "epoch": 1.41, "percentage": 14.12, "elapsed_time": "0:08:37", "remaining_time": "0:52:28"}
15
+ {"current_steps": 130, "total_steps": 850, "loss": 0.0877, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.572844470858537e-05, "epoch": 1.53, "percentage": 15.29, "elapsed_time": "0:09:17", "remaining_time": "0:51:27"}
16
+ {"current_steps": 140, "total_steps": 850, "loss": 0.0804, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.493048024473412e-05, "epoch": 1.65, "percentage": 16.47, "elapsed_time": "0:09:57", "remaining_time": "0:50:29"}
17
+ {"current_steps": 150, "total_steps": 850, "loss": 0.0666, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.406815330073244e-05, "epoch": 1.76, "percentage": 17.65, "elapsed_time": "0:10:37", "remaining_time": "0:49:33"}
18
+ {"current_steps": 150, "total_steps": 850, "loss": null, "eval_loss": 0.08253820985555649, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.76, "percentage": 17.65, "elapsed_time": "0:10:37", "remaining_time": "0:49:33"}
19
+ {"current_steps": 160, "total_steps": 850, "loss": 0.0862, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.314269915162114e-05, "epoch": 1.88, "percentage": 18.82, "elapsed_time": "0:11:36", "remaining_time": "0:50:01"}
20
+ {"current_steps": 170, "total_steps": 850, "loss": 0.063, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.215544350155422e-05, "epoch": 2.0, "percentage": 20.0, "elapsed_time": "0:12:15", "remaining_time": "0:49:03"}
21
+ {"current_steps": 180, "total_steps": 850, "loss": 0.0407, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.110780058474052e-05, "epoch": 2.12, "percentage": 21.18, "elapsed_time": "0:12:55", "remaining_time": "0:48:06"}
22
+ {"current_steps": 190, "total_steps": 850, "loss": 0.0328, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.000127113956674e-05, "epoch": 2.24, "percentage": 22.35, "elapsed_time": "0:13:35", "remaining_time": "0:47:12"}
23
+ {"current_steps": 200, "total_steps": 850, "loss": 0.0173, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.883744025880428e-05, "epoch": 2.35, "percentage": 23.53, "elapsed_time": "0:14:15", "remaining_time": "0:46:19"}
24
+ {"current_steps": 200, "total_steps": 850, "loss": null, "eval_loss": 0.07726477831602097, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.35, "percentage": 23.53, "elapsed_time": "0:14:15", "remaining_time": "0:46:19"}
25
+ {"current_steps": 210, "total_steps": 850, "loss": 0.0353, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.761797511897906e-05, "epoch": 2.47, "percentage": 24.71, "elapsed_time": "0:15:14", "remaining_time": "0:46:26"}
26
+ {"current_steps": 220, "total_steps": 850, "loss": 0.0265, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.634462259215719e-05, "epoch": 2.59, "percentage": 25.88, "elapsed_time": "0:15:54", "remaining_time": "0:45:32"}
27
+ {"current_steps": 230, "total_steps": 850, "loss": 0.0346, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.501920674356754e-05, "epoch": 2.71, "percentage": 27.06, "elapsed_time": "0:16:33", "remaining_time": "0:44:38"}
28
+ {"current_steps": 240, "total_steps": 850, "loss": 0.0228, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.364362621864595e-05, "epoch": 2.82, "percentage": 28.24, "elapsed_time": "0:17:13", "remaining_time": "0:43:46"}
29
+ {"current_steps": 250, "total_steps": 850, "loss": 0.0269, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.221985152324385e-05, "epoch": 2.94, "percentage": 29.41, "elapsed_time": "0:17:53", "remaining_time": "0:42:55"}
30
+ {"current_steps": 250, "total_steps": 850, "loss": null, "eval_loss": 0.08358720690011978, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.94, "percentage": 29.41, "elapsed_time": "0:17:53", "remaining_time": "0:42:55"}
31
+ {"current_steps": 260, "total_steps": 850, "loss": 0.0168, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.074992220089769e-05, "epoch": 3.06, "percentage": 30.59, "elapsed_time": "0:18:52", "remaining_time": "0:42:48"}
32
+ {"current_steps": 270, "total_steps": 850, "loss": 0.0164, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.923594391120236e-05, "epoch": 3.18, "percentage": 31.76, "elapsed_time": "0:19:31", "remaining_time": "0:41:56"}
33
+ {"current_steps": 280, "total_steps": 850, "loss": 0.0068, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.768008541347423e-05, "epoch": 3.29, "percentage": 32.94, "elapsed_time": "0:20:12", "remaining_time": "0:41:08"}
34
+ {"current_steps": 290, "total_steps": 850, "loss": 0.0141, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.608457546002424e-05, "epoch": 3.41, "percentage": 34.12, "elapsed_time": "0:20:52", "remaining_time": "0:40:19"}
35
+ {"current_steps": 300, "total_steps": 850, "loss": 0.0066, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.445169960349167e-05, "epoch": 3.53, "percentage": 35.29, "elapsed_time": "0:21:32", "remaining_time": "0:39:29"}
36
+ {"current_steps": 300, "total_steps": 850, "loss": null, "eval_loss": 0.06564046442508698, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 3.53, "percentage": 35.29, "elapsed_time": "0:21:32", "remaining_time": "0:39:29"}
37
+ {"current_steps": 310, "total_steps": 850, "loss": 0.0071, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.278379692281208e-05, "epoch": 3.65, "percentage": 36.47, "elapsed_time": "0:22:31", "remaining_time": "0:39:13"}
38
+ {"current_steps": 320, "total_steps": 850, "loss": 0.0065, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.10832566725092e-05, "epoch": 3.76, "percentage": 37.65, "elapsed_time": "0:23:10", "remaining_time": "0:38:23"}
39
+ {"current_steps": 330, "total_steps": 850, "loss": 0.005, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.935251486011087e-05, "epoch": 3.88, "percentage": 38.82, "elapsed_time": "0:23:50", "remaining_time": "0:37:34"}
40
+ {"current_steps": 340, "total_steps": 850, "loss": 0.0138, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.759405075659166e-05, "epoch": 4.0, "percentage": 40.0, "elapsed_time": "0:24:30", "remaining_time": "0:36:46"}
41
+ {"current_steps": 350, "total_steps": 850, "loss": 0.003, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.58103833448412e-05, "epoch": 4.12, "percentage": 41.18, "elapsed_time": "0:25:10", "remaining_time": "0:35:57"}
42
+ {"current_steps": 350, "total_steps": 850, "loss": null, "eval_loss": 0.06447144597768784, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 4.12, "percentage": 41.18, "elapsed_time": "0:25:10", "remaining_time": "0:35:57"}
43
+ {"current_steps": 360, "total_steps": 850, "loss": 0.0044, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.400406771124536e-05, "epoch": 4.24, "percentage": 42.35, "elapsed_time": "0:26:09", "remaining_time": "0:35:35"}
44
+ {"current_steps": 370, "total_steps": 850, "loss": 0.0045, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.21776913855496e-05, "epoch": 4.35, "percentage": 43.53, "elapsed_time": "0:26:48", "remaining_time": "0:34:47"}
45
+ {"current_steps": 380, "total_steps": 850, "loss": 0.0066, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.0333870634247645e-05, "epoch": 4.47, "percentage": 44.71, "elapsed_time": "0:27:29", "remaining_time": "0:33:59"}
46
+ {"current_steps": 390, "total_steps": 850, "loss": 0.0019, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.847524671280484e-05, "epoch": 4.59, "percentage": 45.88, "elapsed_time": "0:28:09", "remaining_time": "0:33:12"}
47
+ {"current_steps": 400, "total_steps": 850, "loss": 0.0027, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.660448208208513e-05, "epoch": 4.71, "percentage": 47.06, "elapsed_time": "0:28:49", "remaining_time": "0:32:25"}
48
+ {"current_steps": 400, "total_steps": 850, "loss": null, "eval_loss": 0.0728691965341568, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 4.71, "percentage": 47.06, "elapsed_time": "0:28:49", "remaining_time": "0:32:25"}
49
+ {"current_steps": 410, "total_steps": 850, "loss": 0.0054, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.472425659440157e-05, "epoch": 4.82, "percentage": 48.24, "elapsed_time": "0:29:47", "remaining_time": "0:31:58"}
50
+ {"current_steps": 420, "total_steps": 850, "loss": 0.0016, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.2837263654653715e-05, "epoch": 4.94, "percentage": 49.41, "elapsed_time": "0:30:27", "remaining_time": "0:31:10"}
51
+ {"current_steps": 430, "total_steps": 850, "loss": 0.0012, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.094620636205095e-05, "epoch": 5.06, "percentage": 50.59, "elapsed_time": "0:31:07", "remaining_time": "0:30:23"}
52
+ {"current_steps": 440, "total_steps": 850, "loss": 0.0002, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9053793637949067e-05, "epoch": 5.18, "percentage": 51.76, "elapsed_time": "0:31:47", "remaining_time": "0:29:37"}
53
+ {"current_steps": 450, "total_steps": 850, "loss": 0.0011, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7162736345346303e-05, "epoch": 5.29, "percentage": 52.94, "elapsed_time": "0:32:26", "remaining_time": "0:28:50"}
54
+ {"current_steps": 450, "total_steps": 850, "loss": null, "eval_loss": 0.06781358271837234, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 5.29, "percentage": 52.94, "elapsed_time": "0:32:26", "remaining_time": "0:28:50"}
55
+ {"current_steps": 460, "total_steps": 850, "loss": 0.0002, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.527574340559844e-05, "epoch": 5.41, "percentage": 54.12, "elapsed_time": "0:33:25", "remaining_time": "0:28:20"}
56
+ {"current_steps": 470, "total_steps": 850, "loss": 0.0002, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3395517917914895e-05, "epoch": 5.53, "percentage": 55.29, "elapsed_time": "0:34:05", "remaining_time": "0:27:33"}
57
+ {"current_steps": 480, "total_steps": 850, "loss": 0.0004, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1524753287195165e-05, "epoch": 5.65, "percentage": 56.47, "elapsed_time": "0:34:45", "remaining_time": "0:26:47"}
58
+ {"current_steps": 490, "total_steps": 850, "loss": 0.0004, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.966612936575235e-05, "epoch": 5.76, "percentage": 57.65, "elapsed_time": "0:35:25", "remaining_time": "0:26:01"}
59
+ {"current_steps": 500, "total_steps": 850, "loss": 0.0009, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7822308614450406e-05, "epoch": 5.88, "percentage": 58.82, "elapsed_time": "0:36:05", "remaining_time": "0:25:15"}
60
+ {"current_steps": 500, "total_steps": 850, "loss": null, "eval_loss": 0.0775449350476265, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 5.88, "percentage": 58.82, "elapsed_time": "0:36:05", "remaining_time": "0:25:15"}
61
+ {"current_steps": 510, "total_steps": 850, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.599593228875465e-05, "epoch": 6.0, "percentage": 60.0, "elapsed_time": "0:37:04", "remaining_time": "0:24:42"}
62
+ {"current_steps": 520, "total_steps": 850, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.41896166551588e-05, "epoch": 6.12, "percentage": 61.18, "elapsed_time": "0:37:44", "remaining_time": "0:23:56"}
63
+ {"current_steps": 530, "total_steps": 850, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.240594924340835e-05, "epoch": 6.24, "percentage": 62.35, "elapsed_time": "0:38:23", "remaining_time": "0:23:11"}
64
+ {"current_steps": 540, "total_steps": 850, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0647485139889145e-05, "epoch": 6.35, "percentage": 63.53, "elapsed_time": "0:39:03", "remaining_time": "0:22:25"}
65
+ {"current_steps": 550, "total_steps": 850, "loss": 0.0017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8916743327490803e-05, "epoch": 6.47, "percentage": 64.71, "elapsed_time": "0:39:43", "remaining_time": "0:21:40"}
66
+ {"current_steps": 550, "total_steps": 850, "loss": null, "eval_loss": 0.07820568978786469, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 6.47, "percentage": 64.71, "elapsed_time": "0:39:43", "remaining_time": "0:21:40"}
67
+ {"current_steps": 560, "total_steps": 850, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.721620307718793e-05, "epoch": 6.59, "percentage": 65.88, "elapsed_time": "0:40:43", "remaining_time": "0:21:05"}
68
+ {"current_steps": 570, "total_steps": 850, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.554830039650834e-05, "epoch": 6.71, "percentage": 67.06, "elapsed_time": "0:41:23", "remaining_time": "0:20:19"}
69
+ {"current_steps": 580, "total_steps": 850, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.391542453997578e-05, "epoch": 6.82, "percentage": 68.24, "elapsed_time": "0:42:02", "remaining_time": "0:19:34"}
70
+ {"current_steps": 590, "total_steps": 850, "loss": 0.0, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2319914586525777e-05, "epoch": 6.94, "percentage": 69.41, "elapsed_time": "0:42:42", "remaining_time": "0:18:49"}
71
+ {"current_steps": 600, "total_steps": 850, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0764056088797645e-05, "epoch": 7.06, "percentage": 70.59, "elapsed_time": "0:43:21", "remaining_time": "0:18:04"}
72
+ {"current_steps": 600, "total_steps": 850, "loss": null, "eval_loss": 0.07926832884550095, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 7.06, "percentage": 70.59, "elapsed_time": "0:43:21", "remaining_time": "0:18:04"}
73
+ {"current_steps": 610, "total_steps": 850, "loss": 0.0, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9250077799102322e-05, "epoch": 7.18, "percentage": 71.76, "elapsed_time": "0:44:20", "remaining_time": "0:17:26"}
74
+ {"current_steps": 620, "total_steps": 850, "loss": 0.0, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7780148476756147e-05, "epoch": 7.29, "percentage": 72.94, "elapsed_time": "0:45:00", "remaining_time": "0:16:41"}
75
+ {"current_steps": 630, "total_steps": 850, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6356373781354058e-05, "epoch": 7.41, "percentage": 74.12, "elapsed_time": "0:45:40", "remaining_time": "0:15:57"}
76
+ {"current_steps": 640, "total_steps": 850, "loss": 0.0, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4980793256432474e-05, "epoch": 7.53, "percentage": 75.29, "elapsed_time": "0:46:20", "remaining_time": "0:15:12"}
77
+ {"current_steps": 650, "total_steps": 850, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3655377407842812e-05, "epoch": 7.65, "percentage": 76.47, "elapsed_time": "0:46:59", "remaining_time": "0:14:27"}
78
+ {"current_steps": 650, "total_steps": 850, "loss": null, "eval_loss": 0.08116021752357483, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 7.65, "percentage": 76.47, "elapsed_time": "0:46:59", "remaining_time": "0:14:27"}
79
+ {"current_steps": 650, "total_steps": 850, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 7.65, "percentage": 76.47, "elapsed_time": "0:46:59", "remaining_time": "0:14:27"}
80
+ {"current_steps": 15, "total_steps": 15, "loss": null, "eval_loss": 0.0775449350476265, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 7.65, "percentage": 100.0, "elapsed_time": "0:47:50", "remaining_time": "0:00:00"}
llama2_13b_peft/abstract_narrative_understanding/trainer_state.json ADDED
@@ -0,0 +1,589 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.0775449350476265,
3
+ "best_model_checkpoint": "ckpt/llama2_13b_fuze15_no_sys/abstract_narrative_understanding_no_sys/checkpoint-500",
4
+ "epoch": 7.647058823529412,
5
+ "eval_steps": 50,
6
+ "global_step": 650,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.12,
13
+ "grad_norm": 2.0217950344085693,
14
+ "learning_rate": 5e-05,
15
+ "loss": 2.3117,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.24,
20
+ "grad_norm": 0.8091562986373901,
21
+ "learning_rate": 0.0001,
22
+ "loss": 1.1002,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.35,
27
+ "grad_norm": 0.8186372518539429,
28
+ "learning_rate": 9.996418774081658e-05,
29
+ "loss": 0.291,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.47,
34
+ "grad_norm": 0.7440087199211121,
35
+ "learning_rate": 9.985680226398261e-05,
36
+ "loss": 0.2298,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.59,
41
+ "grad_norm": 0.86050945520401,
42
+ "learning_rate": 9.967799739815925e-05,
43
+ "loss": 0.1783,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.59,
48
+ "eval_loss": 0.15855231881141663,
49
+ "eval_runtime": 18.8402,
50
+ "eval_samples_per_second": 19.108,
51
+ "eval_steps_per_second": 0.796,
52
+ "step": 50
53
+ },
54
+ {
55
+ "epoch": 0.71,
56
+ "grad_norm": 1.0552955865859985,
57
+ "learning_rate": 9.942802927959443e-05,
58
+ "loss": 0.1504,
59
+ "step": 60
60
+ },
61
+ {
62
+ "epoch": 0.82,
63
+ "grad_norm": 1.1410902738571167,
64
+ "learning_rate": 9.910725598521013e-05,
65
+ "loss": 0.1438,
66
+ "step": 70
67
+ },
68
+ {
69
+ "epoch": 0.94,
70
+ "grad_norm": 0.8942976593971252,
71
+ "learning_rate": 9.871613701966067e-05,
72
+ "loss": 0.126,
73
+ "step": 80
74
+ },
75
+ {
76
+ "epoch": 1.06,
77
+ "grad_norm": 1.043587565422058,
78
+ "learning_rate": 9.825523265709666e-05,
79
+ "loss": 0.0922,
80
+ "step": 90
81
+ },
82
+ {
83
+ "epoch": 1.18,
84
+ "grad_norm": 0.6339874863624573,
85
+ "learning_rate": 9.772520313857775e-05,
86
+ "loss": 0.076,
87
+ "step": 100
88
+ },
89
+ {
90
+ "epoch": 1.18,
91
+ "eval_loss": 0.11661680787801743,
92
+ "eval_runtime": 18.8492,
93
+ "eval_samples_per_second": 19.099,
94
+ "eval_steps_per_second": 0.796,
95
+ "step": 100
96
+ },
97
+ {
98
+ "epoch": 1.29,
99
+ "grad_norm": 1.4666110277175903,
100
+ "learning_rate": 9.712680772628364e-05,
101
+ "loss": 0.0649,
102
+ "step": 110
103
+ },
104
+ {
105
+ "epoch": 1.41,
106
+ "grad_norm": 0.6642812490463257,
107
+ "learning_rate": 9.646090361587827e-05,
108
+ "loss": 0.099,
109
+ "step": 120
110
+ },
111
+ {
112
+ "epoch": 1.53,
113
+ "grad_norm": 0.6947500705718994,
114
+ "learning_rate": 9.572844470858537e-05,
115
+ "loss": 0.0877,
116
+ "step": 130
117
+ },
118
+ {
119
+ "epoch": 1.65,
120
+ "grad_norm": 0.6531127691268921,
121
+ "learning_rate": 9.493048024473412e-05,
122
+ "loss": 0.0804,
123
+ "step": 140
124
+ },
125
+ {
126
+ "epoch": 1.76,
127
+ "grad_norm": 0.6246891021728516,
128
+ "learning_rate": 9.406815330073244e-05,
129
+ "loss": 0.0666,
130
+ "step": 150
131
+ },
132
+ {
133
+ "epoch": 1.76,
134
+ "eval_loss": 0.08253820985555649,
135
+ "eval_runtime": 18.8581,
136
+ "eval_samples_per_second": 19.09,
137
+ "eval_steps_per_second": 0.795,
138
+ "step": 150
139
+ },
140
+ {
141
+ "epoch": 1.88,
142
+ "grad_norm": 0.8410322666168213,
143
+ "learning_rate": 9.314269915162114e-05,
144
+ "loss": 0.0862,
145
+ "step": 160
146
+ },
147
+ {
148
+ "epoch": 2.0,
149
+ "grad_norm": 0.8153923153877258,
150
+ "learning_rate": 9.215544350155422e-05,
151
+ "loss": 0.063,
152
+ "step": 170
153
+ },
154
+ {
155
+ "epoch": 2.12,
156
+ "grad_norm": 0.4855154752731323,
157
+ "learning_rate": 9.110780058474052e-05,
158
+ "loss": 0.0407,
159
+ "step": 180
160
+ },
161
+ {
162
+ "epoch": 2.24,
163
+ "grad_norm": 0.3809996247291565,
164
+ "learning_rate": 9.000127113956674e-05,
165
+ "loss": 0.0328,
166
+ "step": 190
167
+ },
168
+ {
169
+ "epoch": 2.35,
170
+ "grad_norm": 0.4585777521133423,
171
+ "learning_rate": 8.883744025880428e-05,
172
+ "loss": 0.0173,
173
+ "step": 200
174
+ },
175
+ {
176
+ "epoch": 2.35,
177
+ "eval_loss": 0.07726477831602097,
178
+ "eval_runtime": 18.8583,
179
+ "eval_samples_per_second": 19.09,
180
+ "eval_steps_per_second": 0.795,
181
+ "step": 200
182
+ },
183
+ {
184
+ "epoch": 2.47,
185
+ "grad_norm": 1.3592562675476074,
186
+ "learning_rate": 8.761797511897906e-05,
187
+ "loss": 0.0353,
188
+ "step": 210
189
+ },
190
+ {
191
+ "epoch": 2.59,
192
+ "grad_norm": 0.32645800709724426,
193
+ "learning_rate": 8.634462259215719e-05,
194
+ "loss": 0.0265,
195
+ "step": 220
196
+ },
197
+ {
198
+ "epoch": 2.71,
199
+ "grad_norm": 0.7699334621429443,
200
+ "learning_rate": 8.501920674356754e-05,
201
+ "loss": 0.0346,
202
+ "step": 230
203
+ },
204
+ {
205
+ "epoch": 2.82,
206
+ "grad_norm": 0.7826615571975708,
207
+ "learning_rate": 8.364362621864595e-05,
208
+ "loss": 0.0228,
209
+ "step": 240
210
+ },
211
+ {
212
+ "epoch": 2.94,
213
+ "grad_norm": 1.523707389831543,
214
+ "learning_rate": 8.221985152324385e-05,
215
+ "loss": 0.0269,
216
+ "step": 250
217
+ },
218
+ {
219
+ "epoch": 2.94,
220
+ "eval_loss": 0.08358720690011978,
221
+ "eval_runtime": 18.8341,
222
+ "eval_samples_per_second": 19.114,
223
+ "eval_steps_per_second": 0.796,
224
+ "step": 250
225
+ },
226
+ {
227
+ "epoch": 3.06,
228
+ "grad_norm": 0.31683385372161865,
229
+ "learning_rate": 8.074992220089769e-05,
230
+ "loss": 0.0168,
231
+ "step": 260
232
+ },
233
+ {
234
+ "epoch": 3.18,
235
+ "grad_norm": 0.20376504957675934,
236
+ "learning_rate": 7.923594391120236e-05,
237
+ "loss": 0.0164,
238
+ "step": 270
239
+ },
240
+ {
241
+ "epoch": 3.29,
242
+ "grad_norm": 0.24734213948249817,
243
+ "learning_rate": 7.768008541347423e-05,
244
+ "loss": 0.0068,
245
+ "step": 280
246
+ },
247
+ {
248
+ "epoch": 3.41,
249
+ "grad_norm": 0.49086877703666687,
250
+ "learning_rate": 7.608457546002424e-05,
251
+ "loss": 0.0141,
252
+ "step": 290
253
+ },
254
+ {
255
+ "epoch": 3.53,
256
+ "grad_norm": 0.051008742302656174,
257
+ "learning_rate": 7.445169960349167e-05,
258
+ "loss": 0.0066,
259
+ "step": 300
260
+ },
261
+ {
262
+ "epoch": 3.53,
263
+ "eval_loss": 0.06564046442508698,
264
+ "eval_runtime": 18.8532,
265
+ "eval_samples_per_second": 19.095,
266
+ "eval_steps_per_second": 0.796,
267
+ "step": 300
268
+ },
269
+ {
270
+ "epoch": 3.65,
271
+ "grad_norm": 0.6045653223991394,
272
+ "learning_rate": 7.278379692281208e-05,
273
+ "loss": 0.0071,
274
+ "step": 310
275
+ },
276
+ {
277
+ "epoch": 3.76,
278
+ "grad_norm": 0.48832693696022034,
279
+ "learning_rate": 7.10832566725092e-05,
280
+ "loss": 0.0065,
281
+ "step": 320
282
+ },
283
+ {
284
+ "epoch": 3.88,
285
+ "grad_norm": 0.32412388920783997,
286
+ "learning_rate": 6.935251486011087e-05,
287
+ "loss": 0.005,
288
+ "step": 330
289
+ },
290
+ {
291
+ "epoch": 4.0,
292
+ "grad_norm": 2.07598614692688,
293
+ "learning_rate": 6.759405075659166e-05,
294
+ "loss": 0.0138,
295
+ "step": 340
296
+ },
297
+ {
298
+ "epoch": 4.12,
299
+ "grad_norm": 0.1396656185388565,
300
+ "learning_rate": 6.58103833448412e-05,
301
+ "loss": 0.003,
302
+ "step": 350
303
+ },
304
+ {
305
+ "epoch": 4.12,
306
+ "eval_loss": 0.06447144597768784,
307
+ "eval_runtime": 18.8594,
308
+ "eval_samples_per_second": 19.089,
309
+ "eval_steps_per_second": 0.795,
310
+ "step": 350
311
+ },
312
+ {
313
+ "epoch": 4.24,
314
+ "grad_norm": 0.07455724477767944,
315
+ "learning_rate": 6.400406771124536e-05,
316
+ "loss": 0.0044,
317
+ "step": 360
318
+ },
319
+ {
320
+ "epoch": 4.35,
321
+ "grad_norm": 0.2568245828151703,
322
+ "learning_rate": 6.21776913855496e-05,
323
+ "loss": 0.0045,
324
+ "step": 370
325
+ },
326
+ {
327
+ "epoch": 4.47,
328
+ "grad_norm": 0.5516663193702698,
329
+ "learning_rate": 6.0333870634247645e-05,
330
+ "loss": 0.0066,
331
+ "step": 380
332
+ },
333
+ {
334
+ "epoch": 4.59,
335
+ "grad_norm": 0.06458062678575516,
336
+ "learning_rate": 5.847524671280484e-05,
337
+ "loss": 0.0019,
338
+ "step": 390
339
+ },
340
+ {
341
+ "epoch": 4.71,
342
+ "grad_norm": 0.03421424329280853,
343
+ "learning_rate": 5.660448208208513e-05,
344
+ "loss": 0.0027,
345
+ "step": 400
346
+ },
347
+ {
348
+ "epoch": 4.71,
349
+ "eval_loss": 0.0728691965341568,
350
+ "eval_runtime": 18.8498,
351
+ "eval_samples_per_second": 19.098,
352
+ "eval_steps_per_second": 0.796,
353
+ "step": 400
354
+ },
355
+ {
356
+ "epoch": 4.82,
357
+ "grad_norm": 0.1939074546098709,
358
+ "learning_rate": 5.472425659440157e-05,
359
+ "loss": 0.0054,
360
+ "step": 410
361
+ },
362
+ {
363
+ "epoch": 4.94,
364
+ "grad_norm": 0.18776360154151917,
365
+ "learning_rate": 5.2837263654653715e-05,
366
+ "loss": 0.0016,
367
+ "step": 420
368
+ },
369
+ {
370
+ "epoch": 5.06,
371
+ "grad_norm": 0.011647732928395271,
372
+ "learning_rate": 5.094620636205095e-05,
373
+ "loss": 0.0012,
374
+ "step": 430
375
+ },
376
+ {
377
+ "epoch": 5.18,
378
+ "grad_norm": 0.010027878917753696,
379
+ "learning_rate": 4.9053793637949067e-05,
380
+ "loss": 0.0002,
381
+ "step": 440
382
+ },
383
+ {
384
+ "epoch": 5.29,
385
+ "grad_norm": 0.02423482947051525,
386
+ "learning_rate": 4.7162736345346303e-05,
387
+ "loss": 0.0011,
388
+ "step": 450
389
+ },
390
+ {
391
+ "epoch": 5.29,
392
+ "eval_loss": 0.06781358271837234,
393
+ "eval_runtime": 18.8438,
394
+ "eval_samples_per_second": 19.104,
395
+ "eval_steps_per_second": 0.796,
396
+ "step": 450
397
+ },
398
+ {
399
+ "epoch": 5.41,
400
+ "grad_norm": 0.0043674008920788765,
401
+ "learning_rate": 4.527574340559844e-05,
402
+ "loss": 0.0002,
403
+ "step": 460
404
+ },
405
+ {
406
+ "epoch": 5.53,
407
+ "grad_norm": 0.013400307856500149,
408
+ "learning_rate": 4.3395517917914895e-05,
409
+ "loss": 0.0002,
410
+ "step": 470
411
+ },
412
+ {
413
+ "epoch": 5.65,
414
+ "grad_norm": 0.02295060083270073,
415
+ "learning_rate": 4.1524753287195165e-05,
416
+ "loss": 0.0004,
417
+ "step": 480
418
+ },
419
+ {
420
+ "epoch": 5.76,
421
+ "grad_norm": 0.027492625638842583,
422
+ "learning_rate": 3.966612936575235e-05,
423
+ "loss": 0.0004,
424
+ "step": 490
425
+ },
426
+ {
427
+ "epoch": 5.88,
428
+ "grad_norm": 0.2364090532064438,
429
+ "learning_rate": 3.7822308614450406e-05,
430
+ "loss": 0.0009,
431
+ "step": 500
432
+ },
433
+ {
434
+ "epoch": 5.88,
435
+ "eval_loss": 0.0775449350476265,
436
+ "eval_runtime": 18.8549,
437
+ "eval_samples_per_second": 19.093,
438
+ "eval_steps_per_second": 0.796,
439
+ "step": 500
440
+ },
441
+ {
442
+ "epoch": 6.0,
443
+ "grad_norm": 0.006466879975050688,
444
+ "learning_rate": 3.599593228875465e-05,
445
+ "loss": 0.0001,
446
+ "step": 510
447
+ },
448
+ {
449
+ "epoch": 6.12,
450
+ "grad_norm": 0.0022094016894698143,
451
+ "learning_rate": 3.41896166551588e-05,
452
+ "loss": 0.0001,
453
+ "step": 520
454
+ },
455
+ {
456
+ "epoch": 6.24,
457
+ "grad_norm": 0.0025171618908643723,
458
+ "learning_rate": 3.240594924340835e-05,
459
+ "loss": 0.0001,
460
+ "step": 530
461
+ },
462
+ {
463
+ "epoch": 6.35,
464
+ "grad_norm": 0.003617402631789446,
465
+ "learning_rate": 3.0647485139889145e-05,
466
+ "loss": 0.0001,
467
+ "step": 540
468
+ },
469
+ {
470
+ "epoch": 6.47,
471
+ "grad_norm": 0.00444138515740633,
472
+ "learning_rate": 2.8916743327490803e-05,
473
+ "loss": 0.0017,
474
+ "step": 550
475
+ },
476
+ {
477
+ "epoch": 6.47,
478
+ "eval_loss": 0.07820568978786469,
479
+ "eval_runtime": 18.8263,
480
+ "eval_samples_per_second": 19.122,
481
+ "eval_steps_per_second": 0.797,
482
+ "step": 550
483
+ },
484
+ {
485
+ "epoch": 6.59,
486
+ "grad_norm": 0.04611958935856819,
487
+ "learning_rate": 2.721620307718793e-05,
488
+ "loss": 0.0001,
489
+ "step": 560
490
+ },
491
+ {
492
+ "epoch": 6.71,
493
+ "grad_norm": 0.005136082414537668,
494
+ "learning_rate": 2.554830039650834e-05,
495
+ "loss": 0.0001,
496
+ "step": 570
497
+ },
498
+ {
499
+ "epoch": 6.82,
500
+ "grad_norm": 0.0014651508536189795,
501
+ "learning_rate": 2.391542453997578e-05,
502
+ "loss": 0.0001,
503
+ "step": 580
504
+ },
505
+ {
506
+ "epoch": 6.94,
507
+ "grad_norm": 0.0029225933831185102,
508
+ "learning_rate": 2.2319914586525777e-05,
509
+ "loss": 0.0,
510
+ "step": 590
511
+ },
512
+ {
513
+ "epoch": 7.06,
514
+ "grad_norm": 0.010204533115029335,
515
+ "learning_rate": 2.0764056088797645e-05,
516
+ "loss": 0.0001,
517
+ "step": 600
518
+ },
519
+ {
520
+ "epoch": 7.06,
521
+ "eval_loss": 0.07926832884550095,
522
+ "eval_runtime": 18.8273,
523
+ "eval_samples_per_second": 19.121,
524
+ "eval_steps_per_second": 0.797,
525
+ "step": 600
526
+ },
527
+ {
528
+ "epoch": 7.18,
529
+ "grad_norm": 0.0016622378025203943,
530
+ "learning_rate": 1.9250077799102322e-05,
531
+ "loss": 0.0,
532
+ "step": 610
533
+ },
534
+ {
535
+ "epoch": 7.29,
536
+ "grad_norm": 0.0054710181429982185,
537
+ "learning_rate": 1.7780148476756147e-05,
538
+ "loss": 0.0,
539
+ "step": 620
540
+ },
541
+ {
542
+ "epoch": 7.41,
543
+ "grad_norm": 0.003711065510287881,
544
+ "learning_rate": 1.6356373781354058e-05,
545
+ "loss": 0.0001,
546
+ "step": 630
547
+ },
548
+ {
549
+ "epoch": 7.53,
550
+ "grad_norm": 0.0011153242085129023,
551
+ "learning_rate": 1.4980793256432474e-05,
552
+ "loss": 0.0,
553
+ "step": 640
554
+ },
555
+ {
556
+ "epoch": 7.65,
557
+ "grad_norm": 0.014780309051275253,
558
+ "learning_rate": 1.3655377407842812e-05,
559
+ "loss": 0.0001,
560
+ "step": 650
561
+ },
562
+ {
563
+ "epoch": 7.65,
564
+ "eval_loss": 0.08116021752357483,
565
+ "eval_runtime": 18.8377,
566
+ "eval_samples_per_second": 19.111,
567
+ "eval_steps_per_second": 0.796,
568
+ "step": 650
569
+ },
570
+ {
571
+ "epoch": 7.65,
572
+ "step": 650,
573
+ "total_flos": 1.541550606773125e+18,
574
+ "train_loss": 0.0863784014862568,
575
+ "train_runtime": 2843.6931,
576
+ "train_samples_per_second": 7.174,
577
+ "train_steps_per_second": 0.299
578
+ }
579
+ ],
580
+ "logging_steps": 10,
581
+ "max_steps": 850,
582
+ "num_input_tokens_seen": 0,
583
+ "num_train_epochs": 10,
584
+ "save_steps": 500,
585
+ "total_flos": 1.541550606773125e+18,
586
+ "train_batch_size": 8,
587
+ "trial_name": null,
588
+ "trial_params": null
589
+ }
llama2_13b_peft/abstract_narrative_understanding/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c13f2efe6e7a8ca780611c8deeaba1a047aa3f9ec1b761c9f8aba21759301a2
3
+ size 5112
llama2_13b_peft/abstract_narrative_understanding/training_eval_loss.png ADDED
llama2_13b_peft/abstract_narrative_understanding/training_loss.png ADDED
llama2_13b_peft/alpaca/README.md ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: peft
4
+ tags:
5
+ - llama-factory
6
+ - lora
7
+ - generated_from_trainer
8
+ base_model: /data1/model/llama2/meta-llama/Llama2-13b
9
+ model-index:
10
+ - name: alpaca_no_sys
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # alpaca_no_sys
18
+
19
+ This model is a fine-tuned version of [/data1/model/llama2/meta-llama/Llama2-13b](https://huggingface.co//data1/model/llama2/meta-llama/Llama2-13b) on the alpaca_no_sys dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 1.0082
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 0.0001
41
+ - train_batch_size: 8
42
+ - eval_batch_size: 8
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - num_devices: 2
46
+ - total_train_batch_size: 16
47
+ - total_eval_batch_size: 16
48
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
+ - lr_scheduler_type: cosine
50
+ - lr_scheduler_warmup_steps: 20
51
+ - num_epochs: 10.0
52
+
53
+ ### Training results
54
+
55
+ | Training Loss | Epoch | Step | Validation Loss |
56
+ |:-------------:|:-----:|:----:|:---------------:|
57
+ | 1.0429 | 0.07 | 200 | 1.0298 |
58
+ | 1.0505 | 0.14 | 400 | 1.0238 |
59
+ | 1.044 | 0.22 | 600 | 1.0194 |
60
+ | 1.0169 | 0.29 | 800 | 1.0172 |
61
+ | 1.02 | 0.36 | 1000 | 1.0154 |
62
+ | 0.9492 | 0.43 | 1200 | 1.0134 |
63
+ | 1.0051 | 0.51 | 1400 | 1.0117 |
64
+ | 1.0469 | 0.58 | 1600 | 1.0106 |
65
+ | 0.9994 | 0.65 | 1800 | 1.0094 |
66
+ | 1.0141 | 0.72 | 2000 | 1.0082 |
67
+ | 1.0891 | 0.8 | 2200 | 1.0073 |
68
+ | 1.0141 | 0.87 | 2400 | 1.0063 |
69
+ | 1.0002 | 0.94 | 2600 | 1.0059 |
70
+ | 0.9686 | 1.01 | 2800 | 1.0086 |
71
+ | 0.9767 | 1.09 | 3000 | 1.0141 |
72
+ | 0.9494 | 1.16 | 3200 | 1.0160 |
73
+
74
+
75
+ ### Framework versions
76
+
77
+ - PEFT 0.9.0
78
+ - Transformers 4.38.2
79
+ - Pytorch 2.2.1
80
+ - Datasets 2.18.0
81
+ - Tokenizers 0.15.2
llama2_13b_peft/alpaca/adapter_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/data1/model/llama2/meta-llama/Llama2-13b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.0,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 8,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "o_proj",
23
+ "gate_proj",
24
+ "v_proj",
25
+ "down_proj",
26
+ "up_proj",
27
+ "q_proj",
28
+ "k_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "use_dora": false,
32
+ "use_rslora": false
33
+ }
llama2_13b_peft/alpaca/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89face3e5e4bb009020e5742eaa4fa7c830124d15007f50b345d18c3c0daf8df
3
+ size 125248064
llama2_13b_peft/alpaca/all_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.16,
3
+ "eval_loss": 1.0082145929336548,
4
+ "eval_runtime": 124.7969,
5
+ "eval_samples_per_second": 62.51,
6
+ "eval_steps_per_second": 3.91,
7
+ "train_loss": 1.015018144249916,
8
+ "train_runtime": 4695.8401,
9
+ "train_samples_per_second": 94.128,
10
+ "train_steps_per_second": 5.884
11
+ }
llama2_13b_peft/alpaca/eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.16,
3
+ "eval_loss": 1.0082145929336548,
4
+ "eval_runtime": 124.7969,
5
+ "eval_samples_per_second": 62.51,
6
+ "eval_steps_per_second": 3.91
7
+ }
llama2_13b_peft/alpaca/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
llama2_13b_peft/alpaca/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
llama2_13b_peft/alpaca/tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "LlamaTokenizer",
43
+ "unk_token": "<unk>",
44
+ "use_default_system_prompt": false
45
+ }
llama2_13b_peft/alpaca/train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.16,
3
+ "train_loss": 1.015018144249916,
4
+ "train_runtime": 4695.8401,
5
+ "train_samples_per_second": 94.128,
6
+ "train_steps_per_second": 5.884
7
+ }
llama2_13b_peft/alpaca/trainer_log.jsonl ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 27630, "loss": 1.4369, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5e-05, "epoch": 0.0, "percentage": 0.04, "elapsed_time": "0:00:10", "remaining_time": "8:15:16"}
2
+ {"current_steps": 20, "total_steps": 27630, "loss": 1.3624, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001, "epoch": 0.01, "percentage": 0.07, "elapsed_time": "0:00:19", "remaining_time": "7:33:01"}
3
+ {"current_steps": 30, "total_steps": 27630, "loss": 1.1589, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999996763266864e-05, "epoch": 0.01, "percentage": 0.11, "elapsed_time": "0:00:28", "remaining_time": "7:16:26"}
4
+ {"current_steps": 40, "total_steps": 27630, "loss": 1.1103, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999987053071647e-05, "epoch": 0.01, "percentage": 0.14, "elapsed_time": "0:00:35", "remaining_time": "6:42:44"}
5
+ {"current_steps": 50, "total_steps": 27630, "loss": 1.0601, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.99997086942692e-05, "epoch": 0.02, "percentage": 0.18, "elapsed_time": "0:00:43", "remaining_time": "6:37:46"}
6
+ {"current_steps": 60, "total_steps": 27630, "loss": 1.0302, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999948212353635e-05, "epoch": 0.02, "percentage": 0.22, "elapsed_time": "0:00:50", "remaining_time": "6:30:17"}
7
+ {"current_steps": 70, "total_steps": 27630, "loss": 1.114, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999919081881129e-05, "epoch": 0.03, "percentage": 0.25, "elapsed_time": "0:00:59", "remaining_time": "6:33:10"}
8
+ {"current_steps": 80, "total_steps": 27630, "loss": 1.0913, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999883478047113e-05, "epoch": 0.03, "percentage": 0.29, "elapsed_time": "0:01:09", "remaining_time": "6:36:03"}
9
+ {"current_steps": 90, "total_steps": 27630, "loss": 1.0778, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999841400897687e-05, "epoch": 0.03, "percentage": 0.33, "elapsed_time": "0:01:18", "remaining_time": "6:38:18"}
10
+ {"current_steps": 100, "total_steps": 27630, "loss": 1.0493, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999792850487325e-05, "epoch": 0.04, "percentage": 0.36, "elapsed_time": "0:01:25", "remaining_time": "6:32:31"}
11
+ {"current_steps": 110, "total_steps": 27630, "loss": 1.0249, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999737826878886e-05, "epoch": 0.04, "percentage": 0.4, "elapsed_time": "0:01:33", "remaining_time": "6:30:08"}
12
+ {"current_steps": 120, "total_steps": 27630, "loss": 1.0594, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.99967633014361e-05, "epoch": 0.04, "percentage": 0.43, "elapsed_time": "0:01:41", "remaining_time": "6:28:19"}
13
+ {"current_steps": 130, "total_steps": 27630, "loss": 1.0527, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999608360361113e-05, "epoch": 0.05, "percentage": 0.47, "elapsed_time": "0:01:49", "remaining_time": "6:27:25"}
14
+ {"current_steps": 140, "total_steps": 27630, "loss": 1.0051, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999533917619399e-05, "epoch": 0.05, "percentage": 0.51, "elapsed_time": "0:01:59", "remaining_time": "6:29:40"}
15
+ {"current_steps": 150, "total_steps": 27630, "loss": 0.9906, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999453002014846e-05, "epoch": 0.05, "percentage": 0.54, "elapsed_time": "0:02:06", "remaining_time": "6:25:18"}
16
+ {"current_steps": 160, "total_steps": 27630, "loss": 1.0197, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999365613652217e-05, "epoch": 0.06, "percentage": 0.58, "elapsed_time": "0:02:13", "remaining_time": "6:20:51"}
17
+ {"current_steps": 170, "total_steps": 27630, "loss": 1.0356, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999271752644649e-05, "epoch": 0.06, "percentage": 0.62, "elapsed_time": "0:02:23", "remaining_time": "6:26:18"}
18
+ {"current_steps": 180, "total_steps": 27630, "loss": 1.0332, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999171419113666e-05, "epoch": 0.07, "percentage": 0.65, "elapsed_time": "0:02:30", "remaining_time": "6:23:32"}
19
+ {"current_steps": 190, "total_steps": 27630, "loss": 1.0126, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999064613189171e-05, "epoch": 0.07, "percentage": 0.69, "elapsed_time": "0:02:38", "remaining_time": "6:20:45"}
20
+ {"current_steps": 200, "total_steps": 27630, "loss": 1.0429, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.998951335009442e-05, "epoch": 0.07, "percentage": 0.72, "elapsed_time": "0:02:46", "remaining_time": "6:20:16"}
21
+ {"current_steps": 200, "total_steps": 27630, "loss": null, "eval_loss": 1.029819130897522, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.07, "percentage": 0.72, "elapsed_time": "0:02:46", "remaining_time": "6:20:16"}
22
+ {"current_steps": 210, "total_steps": 27630, "loss": 1.0431, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.998831584721141e-05, "epoch": 0.08, "percentage": 0.76, "elapsed_time": "0:04:59", "remaining_time": "10:51:19"}
23
+ {"current_steps": 220, "total_steps": 27630, "loss": 1.0374, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.998705362479307e-05, "epoch": 0.08, "percentage": 0.8, "elapsed_time": "0:05:09", "remaining_time": "10:42:39"}
24
+ {"current_steps": 230, "total_steps": 27630, "loss": 1.1065, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.99857266844736e-05, "epoch": 0.08, "percentage": 0.83, "elapsed_time": "0:05:17", "remaining_time": "10:30:52"}
25
+ {"current_steps": 240, "total_steps": 27630, "loss": 1.1105, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.998433502797095e-05, "epoch": 0.09, "percentage": 0.87, "elapsed_time": "0:05:24", "remaining_time": "10:17:51"}
26
+ {"current_steps": 250, "total_steps": 27630, "loss": 0.9839, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.998287865708694e-05, "epoch": 0.09, "percentage": 0.9, "elapsed_time": "0:05:32", "remaining_time": "10:06:47"}
27
+ {"current_steps": 260, "total_steps": 27630, "loss": 1.0401, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.998135757370708e-05, "epoch": 0.09, "percentage": 0.94, "elapsed_time": "0:05:40", "remaining_time": "9:56:33"}
28
+ {"current_steps": 270, "total_steps": 27630, "loss": 1.0461, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.997977177980074e-05, "epoch": 0.1, "percentage": 0.98, "elapsed_time": "0:05:48", "remaining_time": "9:49:11"}
29
+ {"current_steps": 280, "total_steps": 27630, "loss": 1.0662, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.9978121277421e-05, "epoch": 0.1, "percentage": 1.01, "elapsed_time": "0:05:57", "remaining_time": "9:42:19"}
30
+ {"current_steps": 290, "total_steps": 27630, "loss": 1.0736, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.99764060687048e-05, "epoch": 0.1, "percentage": 1.05, "elapsed_time": "0:06:06", "remaining_time": "9:35:51"}
31
+ {"current_steps": 300, "total_steps": 27630, "loss": 0.9963, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.997462615587276e-05, "epoch": 0.11, "percentage": 1.09, "elapsed_time": "0:06:14", "remaining_time": "9:28:28"}
32
+ {"current_steps": 310, "total_steps": 27630, "loss": 1.044, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.997278154122935e-05, "epoch": 0.11, "percentage": 1.12, "elapsed_time": "0:06:23", "remaining_time": "9:23:40"}
33
+ {"current_steps": 320, "total_steps": 27630, "loss": 1.0713, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.997087222716278e-05, "epoch": 0.12, "percentage": 1.16, "elapsed_time": "0:06:31", "remaining_time": "9:17:11"}
34
+ {"current_steps": 330, "total_steps": 27630, "loss": 1.0721, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.996889821614502e-05, "epoch": 0.12, "percentage": 1.19, "elapsed_time": "0:06:40", "remaining_time": "9:11:37"}
35
+ {"current_steps": 340, "total_steps": 27630, "loss": 1.0414, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.996685951073182e-05, "epoch": 0.12, "percentage": 1.23, "elapsed_time": "0:06:48", "remaining_time": "9:05:51"}
36
+ {"current_steps": 350, "total_steps": 27630, "loss": 0.9856, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.996475611356264e-05, "epoch": 0.13, "percentage": 1.27, "elapsed_time": "0:06:55", "remaining_time": "9:00:01"}
37
+ {"current_steps": 360, "total_steps": 27630, "loss": 1.0121, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.996258802736079e-05, "epoch": 0.13, "percentage": 1.3, "elapsed_time": "0:07:03", "remaining_time": "8:55:02"}
38
+ {"current_steps": 370, "total_steps": 27630, "loss": 1.0785, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.996035525493322e-05, "epoch": 0.13, "percentage": 1.34, "elapsed_time": "0:07:12", "remaining_time": "8:50:45"}
39
+ {"current_steps": 380, "total_steps": 27630, "loss": 0.996, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.995805779917073e-05, "epoch": 0.14, "percentage": 1.38, "elapsed_time": "0:07:20", "remaining_time": "8:46:51"}
40
+ {"current_steps": 390, "total_steps": 27630, "loss": 1.0557, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.99556956630478e-05, "epoch": 0.14, "percentage": 1.41, "elapsed_time": "0:07:29", "remaining_time": "8:43:28"}
41
+ {"current_steps": 400, "total_steps": 27630, "loss": 1.0505, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.995326884962268e-05, "epoch": 0.14, "percentage": 1.45, "elapsed_time": "0:07:37", "remaining_time": "8:38:37"}
42
+ {"current_steps": 400, "total_steps": 27630, "loss": null, "eval_loss": 1.023820400238037, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.14, "percentage": 1.45, "elapsed_time": "0:07:37", "remaining_time": "8:38:37"}
43
+ {"current_steps": 410, "total_steps": 27630, "loss": 0.9919, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.995077736203733e-05, "epoch": 0.15, "percentage": 1.48, "elapsed_time": "0:09:50", "remaining_time": "10:53:45"}
44
+ {"current_steps": 420, "total_steps": 27630, "loss": 1.0736, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.99482212035175e-05, "epoch": 0.15, "percentage": 1.52, "elapsed_time": "0:10:00", "remaining_time": "10:48:23"}
45
+ {"current_steps": 430, "total_steps": 27630, "loss": 1.0633, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.994560037737259e-05, "epoch": 0.16, "percentage": 1.56, "elapsed_time": "0:10:08", "remaining_time": "10:41:21"}
46
+ {"current_steps": 440, "total_steps": 27630, "loss": 1.049, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.994291488699579e-05, "epoch": 0.16, "percentage": 1.59, "elapsed_time": "0:10:15", "remaining_time": "10:34:13"}
47
+ {"current_steps": 450, "total_steps": 27630, "loss": 1.0022, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.994016473586398e-05, "epoch": 0.16, "percentage": 1.63, "elapsed_time": "0:10:23", "remaining_time": "10:28:02"}
48
+ {"current_steps": 460, "total_steps": 27630, "loss": 1.0076, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.993734992753777e-05, "epoch": 0.17, "percentage": 1.66, "elapsed_time": "0:10:34", "remaining_time": "10:25:03"}
49
+ {"current_steps": 470, "total_steps": 27630, "loss": 1.0298, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.993447046566146e-05, "epoch": 0.17, "percentage": 1.7, "elapsed_time": "0:10:44", "remaining_time": "10:20:26"}
50
+ {"current_steps": 480, "total_steps": 27630, "loss": 1.0635, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.993152635396308e-05, "epoch": 0.17, "percentage": 1.74, "elapsed_time": "0:10:52", "remaining_time": "10:14:51"}
51
+ {"current_steps": 490, "total_steps": 27630, "loss": 1.0183, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.992851759625433e-05, "epoch": 0.18, "percentage": 1.77, "elapsed_time": "0:11:00", "remaining_time": "10:09:51"}
52
+ {"current_steps": 500, "total_steps": 27630, "loss": 0.963, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.992544419643066e-05, "epoch": 0.18, "percentage": 1.81, "elapsed_time": "0:11:08", "remaining_time": "10:04:25"}
53
+ {"current_steps": 510, "total_steps": 27630, "loss": 0.9691, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.992230615847116e-05, "epoch": 0.18, "percentage": 1.85, "elapsed_time": "0:11:16", "remaining_time": "9:59:54"}
54
+ {"current_steps": 520, "total_steps": 27630, "loss": 1.0309, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.991910348643865e-05, "epoch": 0.19, "percentage": 1.88, "elapsed_time": "0:11:25", "remaining_time": "9:55:38"}
55
+ {"current_steps": 530, "total_steps": 27630, "loss": 1.0232, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.991583618447958e-05, "epoch": 0.19, "percentage": 1.92, "elapsed_time": "0:11:34", "remaining_time": "9:51:28"}
56
+ {"current_steps": 540, "total_steps": 27630, "loss": 1.0308, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.99125042568241e-05, "epoch": 0.2, "percentage": 1.95, "elapsed_time": "0:11:41", "remaining_time": "9:46:47"}
57
+ {"current_steps": 550, "total_steps": 27630, "loss": 1.0581, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.990910770778606e-05, "epoch": 0.2, "percentage": 1.99, "elapsed_time": "0:11:50", "remaining_time": "9:42:50"}
58
+ {"current_steps": 560, "total_steps": 27630, "loss": 0.958, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.990564654176293e-05, "epoch": 0.2, "percentage": 2.03, "elapsed_time": "0:11:58", "remaining_time": "9:38:52"}
59
+ {"current_steps": 570, "total_steps": 27630, "loss": 1.0258, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.990212076323586e-05, "epoch": 0.21, "percentage": 2.06, "elapsed_time": "0:12:07", "remaining_time": "9:35:27"}
60
+ {"current_steps": 580, "total_steps": 27630, "loss": 1.0724, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.989853037676965e-05, "epoch": 0.21, "percentage": 2.1, "elapsed_time": "0:12:15", "remaining_time": "9:31:37"}
61
+ {"current_steps": 590, "total_steps": 27630, "loss": 0.9847, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.989487538701279e-05, "epoch": 0.21, "percentage": 2.14, "elapsed_time": "0:12:23", "remaining_time": "9:28:02"}
62
+ {"current_steps": 600, "total_steps": 27630, "loss": 1.044, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.989115579869732e-05, "epoch": 0.22, "percentage": 2.17, "elapsed_time": "0:12:31", "remaining_time": "9:24:36"}
63
+ {"current_steps": 600, "total_steps": 27630, "loss": null, "eval_loss": 1.0194298028945923, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.22, "percentage": 2.17, "elapsed_time": "0:12:31", "remaining_time": "9:24:36"}
64
+ {"current_steps": 610, "total_steps": 27630, "loss": 1.0244, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.988737161663898e-05, "epoch": 0.22, "percentage": 2.21, "elapsed_time": "0:14:45", "remaining_time": "10:53:34"}
65
+ {"current_steps": 620, "total_steps": 27630, "loss": 1.0254, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.988352284573713e-05, "epoch": 0.22, "percentage": 2.24, "elapsed_time": "0:14:54", "remaining_time": "10:49:13"}
66
+ {"current_steps": 630, "total_steps": 27630, "loss": 1.1093, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.987960949097475e-05, "epoch": 0.23, "percentage": 2.28, "elapsed_time": "0:15:02", "remaining_time": "10:44:31"}
67
+ {"current_steps": 640, "total_steps": 27630, "loss": 1.0196, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.987563155741842e-05, "epoch": 0.23, "percentage": 2.32, "elapsed_time": "0:15:09", "remaining_time": "10:39:31"}
68
+ {"current_steps": 650, "total_steps": 27630, "loss": 1.012, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.987158905021836e-05, "epoch": 0.24, "percentage": 2.35, "elapsed_time": "0:15:17", "remaining_time": "10:34:35"}
69
+ {"current_steps": 660, "total_steps": 27630, "loss": 1.0219, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.986748197460837e-05, "epoch": 0.24, "percentage": 2.39, "elapsed_time": "0:15:24", "remaining_time": "10:29:55"}
70
+ {"current_steps": 670, "total_steps": 27630, "loss": 1.015, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.986331033590586e-05, "epoch": 0.24, "percentage": 2.42, "elapsed_time": "0:15:33", "remaining_time": "10:26:13"}
71
+ {"current_steps": 680, "total_steps": 27630, "loss": 1.1113, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.98590741395118e-05, "epoch": 0.25, "percentage": 2.46, "elapsed_time": "0:15:40", "remaining_time": "10:21:24"}
72
+ {"current_steps": 690, "total_steps": 27630, "loss": 1.0456, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.985477339091078e-05, "epoch": 0.25, "percentage": 2.5, "elapsed_time": "0:15:49", "remaining_time": "10:17:54"}
73
+ {"current_steps": 700, "total_steps": 27630, "loss": 1.0102, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.985040809567097e-05, "epoch": 0.25, "percentage": 2.53, "elapsed_time": "0:15:57", "remaining_time": "10:14:15"}
74
+ {"current_steps": 710, "total_steps": 27630, "loss": 1.0057, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.984597825944405e-05, "epoch": 0.26, "percentage": 2.57, "elapsed_time": "0:16:04", "remaining_time": "10:09:39"}
75
+ {"current_steps": 720, "total_steps": 27630, "loss": 0.9937, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.984148388796532e-05, "epoch": 0.26, "percentage": 2.61, "elapsed_time": "0:16:15", "remaining_time": "10:07:25"}
76
+ {"current_steps": 730, "total_steps": 27630, "loss": 0.9937, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.983692498705361e-05, "epoch": 0.26, "percentage": 2.64, "elapsed_time": "0:16:24", "remaining_time": "10:04:25"}
77
+ {"current_steps": 740, "total_steps": 27630, "loss": 1.0205, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.983230156261132e-05, "epoch": 0.27, "percentage": 2.68, "elapsed_time": "0:16:32", "remaining_time": "10:00:48"}
78
+ {"current_steps": 750, "total_steps": 27630, "loss": 1.0486, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.982761362062432e-05, "epoch": 0.27, "percentage": 2.71, "elapsed_time": "0:16:40", "remaining_time": "9:57:30"}
79
+ {"current_steps": 760, "total_steps": 27630, "loss": 1.0679, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.982286116716208e-05, "epoch": 0.28, "percentage": 2.75, "elapsed_time": "0:16:50", "remaining_time": "9:55:18"}
80
+ {"current_steps": 770, "total_steps": 27630, "loss": 1.0051, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.98180442083776e-05, "epoch": 0.28, "percentage": 2.79, "elapsed_time": "0:16:57", "remaining_time": "9:51:45"}
81
+ {"current_steps": 780, "total_steps": 27630, "loss": 1.0398, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.981316275050731e-05, "epoch": 0.28, "percentage": 2.82, "elapsed_time": "0:17:05", "remaining_time": "9:48:32"}
82
+ {"current_steps": 790, "total_steps": 27630, "loss": 1.0365, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.980821679987125e-05, "epoch": 0.29, "percentage": 2.86, "elapsed_time": "0:17:13", "remaining_time": "9:45:26"}
83
+ {"current_steps": 800, "total_steps": 27630, "loss": 1.0169, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.980320636287285e-05, "epoch": 0.29, "percentage": 2.9, "elapsed_time": "0:17:22", "remaining_time": "9:42:45"}
84
+ {"current_steps": 800, "total_steps": 27630, "loss": null, "eval_loss": 1.0172123908996582, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.29, "percentage": 2.9, "elapsed_time": "0:17:22", "remaining_time": "9:42:45"}
85
+ {"current_steps": 810, "total_steps": 27630, "loss": 1.0165, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.979813144599915e-05, "epoch": 0.29, "percentage": 2.93, "elapsed_time": "0:19:34", "remaining_time": "10:48:16"}
86
+ {"current_steps": 820, "total_steps": 27630, "loss": 1.0314, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.979299205582057e-05, "epoch": 0.3, "percentage": 2.97, "elapsed_time": "0:19:42", "remaining_time": "10:44:22"}
87
+ {"current_steps": 830, "total_steps": 27630, "loss": 0.9779, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.978778819899109e-05, "epoch": 0.3, "percentage": 3.0, "elapsed_time": "0:19:50", "remaining_time": "10:40:40"}
88
+ {"current_steps": 840, "total_steps": 27630, "loss": 0.9564, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.978251988224804e-05, "epoch": 0.3, "percentage": 3.04, "elapsed_time": "0:19:58", "remaining_time": "10:36:56"}
89
+ {"current_steps": 850, "total_steps": 27630, "loss": 1.0275, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.977718711241233e-05, "epoch": 0.31, "percentage": 3.08, "elapsed_time": "0:20:06", "remaining_time": "10:33:39"}
90
+ {"current_steps": 860, "total_steps": 27630, "loss": 1.0293, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.977178989638822e-05, "epoch": 0.31, "percentage": 3.11, "elapsed_time": "0:20:16", "remaining_time": "10:31:04"}
91
+ {"current_steps": 870, "total_steps": 27630, "loss": 1.0508, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.97663282411635e-05, "epoch": 0.31, "percentage": 3.15, "elapsed_time": "0:20:24", "remaining_time": "10:27:45"}
92
+ {"current_steps": 880, "total_steps": 27630, "loss": 0.9949, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.97608021538093e-05, "epoch": 0.32, "percentage": 3.18, "elapsed_time": "0:20:32", "remaining_time": "10:24:34"}
93
+ {"current_steps": 890, "total_steps": 27630, "loss": 1.0752, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.97552116414802e-05, "epoch": 0.32, "percentage": 3.22, "elapsed_time": "0:20:40", "remaining_time": "10:21:23"}
94
+ {"current_steps": 900, "total_steps": 27630, "loss": 0.9947, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.974955671141424e-05, "epoch": 0.33, "percentage": 3.26, "elapsed_time": "0:20:48", "remaining_time": "10:17:58"}
95
+ {"current_steps": 910, "total_steps": 27630, "loss": 1.0362, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.974383737093279e-05, "epoch": 0.33, "percentage": 3.29, "elapsed_time": "0:20:56", "remaining_time": "10:15:04"}
96
+ {"current_steps": 920, "total_steps": 27630, "loss": 1.0469, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.973805362744064e-05, "epoch": 0.33, "percentage": 3.33, "elapsed_time": "0:21:04", "remaining_time": "10:11:41"}
97
+ {"current_steps": 930, "total_steps": 27630, "loss": 0.9705, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.973220548842598e-05, "epoch": 0.34, "percentage": 3.37, "elapsed_time": "0:21:14", "remaining_time": "10:09:43"}
98
+ {"current_steps": 940, "total_steps": 27630, "loss": 0.9956, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.972629296146035e-05, "epoch": 0.34, "percentage": 3.4, "elapsed_time": "0:21:22", "remaining_time": "10:07:08"}
99
+ {"current_steps": 950, "total_steps": 27630, "loss": 1.0232, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.972031605419864e-05, "epoch": 0.34, "percentage": 3.44, "elapsed_time": "0:21:32", "remaining_time": "10:05:05"}
100
+ {"current_steps": 960, "total_steps": 27630, "loss": 1.0471, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.971427477437914e-05, "epoch": 0.35, "percentage": 3.47, "elapsed_time": "0:21:40", "remaining_time": "10:02:21"}
101
+ {"current_steps": 970, "total_steps": 27630, "loss": 0.9652, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.970816912982344e-05, "epoch": 0.35, "percentage": 3.51, "elapsed_time": "0:21:49", "remaining_time": "9:59:46"}
102
+ {"current_steps": 980, "total_steps": 27630, "loss": 0.9894, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.970199912843648e-05, "epoch": 0.35, "percentage": 3.55, "elapsed_time": "0:21:57", "remaining_time": "9:57:16"}
103
+ {"current_steps": 990, "total_steps": 27630, "loss": 1.0437, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.96957647782065e-05, "epoch": 0.36, "percentage": 3.58, "elapsed_time": "0:22:05", "remaining_time": "9:54:30"}
104
+ {"current_steps": 1000, "total_steps": 27630, "loss": 1.02, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.968946608720511e-05, "epoch": 0.36, "percentage": 3.62, "elapsed_time": "0:22:14", "remaining_time": "9:52:27"}
105
+ {"current_steps": 1000, "total_steps": 27630, "loss": null, "eval_loss": 1.0154483318328857, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.36, "percentage": 3.62, "elapsed_time": "0:22:14", "remaining_time": "9:52:27"}
106
+ {"current_steps": 1010, "total_steps": 27630, "loss": 1.0676, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.968310306358715e-05, "epoch": 0.37, "percentage": 3.66, "elapsed_time": "0:24:28", "remaining_time": "10:45:00"}
107
+ {"current_steps": 1020, "total_steps": 27630, "loss": 1.027, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.967667571559081e-05, "epoch": 0.37, "percentage": 3.69, "elapsed_time": "0:24:36", "remaining_time": "10:42:05"}
108
+ {"current_steps": 1030, "total_steps": 27630, "loss": 1.0004, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.967018405153749e-05, "epoch": 0.37, "percentage": 3.73, "elapsed_time": "0:24:46", "remaining_time": "10:39:39"}
109
+ {"current_steps": 1040, "total_steps": 27630, "loss": 1.0395, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.966362807983196e-05, "epoch": 0.38, "percentage": 3.76, "elapsed_time": "0:24:56", "remaining_time": "10:37:29"}
110
+ {"current_steps": 1050, "total_steps": 27630, "loss": 0.9948, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.965700780896216e-05, "epoch": 0.38, "percentage": 3.8, "elapsed_time": "0:25:05", "remaining_time": "10:34:58"}
111
+ {"current_steps": 1060, "total_steps": 27630, "loss": 1.0281, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.965032324749932e-05, "epoch": 0.38, "percentage": 3.84, "elapsed_time": "0:25:12", "remaining_time": "10:31:58"}
112
+ {"current_steps": 1070, "total_steps": 27630, "loss": 1.0094, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.964357440409789e-05, "epoch": 0.39, "percentage": 3.87, "elapsed_time": "0:25:21", "remaining_time": "10:29:30"}
113
+ {"current_steps": 1080, "total_steps": 27630, "loss": 1.0272, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.963676128749553e-05, "epoch": 0.39, "percentage": 3.91, "elapsed_time": "0:25:28", "remaining_time": "10:26:21"}
114
+ {"current_steps": 1090, "total_steps": 27630, "loss": 1.0191, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.96298839065132e-05, "epoch": 0.39, "percentage": 3.94, "elapsed_time": "0:25:38", "remaining_time": "10:24:14"}
115
+ {"current_steps": 1100, "total_steps": 27630, "loss": 1.08, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.962294227005493e-05, "epoch": 0.4, "percentage": 3.98, "elapsed_time": "0:25:47", "remaining_time": "10:21:51"}
116
+ {"current_steps": 1110, "total_steps": 27630, "loss": 0.9954, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.961593638710804e-05, "epoch": 0.4, "percentage": 4.02, "elapsed_time": "0:25:55", "remaining_time": "10:19:25"}
117
+ {"current_steps": 1120, "total_steps": 27630, "loss": 1.071, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.960886626674302e-05, "epoch": 0.41, "percentage": 4.05, "elapsed_time": "0:26:03", "remaining_time": "10:16:47"}
118
+ {"current_steps": 1130, "total_steps": 27630, "loss": 0.9725, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.960173191811348e-05, "epoch": 0.41, "percentage": 4.09, "elapsed_time": "0:26:12", "remaining_time": "10:14:29"}
119
+ {"current_steps": 1140, "total_steps": 27630, "loss": 1.0071, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.959453335045622e-05, "epoch": 0.41, "percentage": 4.13, "elapsed_time": "0:26:19", "remaining_time": "10:11:46"}
120
+ {"current_steps": 1150, "total_steps": 27630, "loss": 1.0108, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.958727057309115e-05, "epoch": 0.42, "percentage": 4.16, "elapsed_time": "0:26:29", "remaining_time": "10:10:04"}
121
+ {"current_steps": 1160, "total_steps": 27630, "loss": 1.0495, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.957994359542138e-05, "epoch": 0.42, "percentage": 4.2, "elapsed_time": "0:26:37", "remaining_time": "10:07:40"}
122
+ {"current_steps": 1170, "total_steps": 27630, "loss": 1.0015, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.957255242693308e-05, "epoch": 0.42, "percentage": 4.23, "elapsed_time": "0:26:46", "remaining_time": "10:05:29"}
123
+ {"current_steps": 1180, "total_steps": 27630, "loss": 1.0559, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.956509707719555e-05, "epoch": 0.43, "percentage": 4.27, "elapsed_time": "0:26:54", "remaining_time": "10:03:17"}
124
+ {"current_steps": 1190, "total_steps": 27630, "loss": 1.0134, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.955757755586119e-05, "epoch": 0.43, "percentage": 4.31, "elapsed_time": "0:27:03", "remaining_time": "10:01:20"}
125
+ {"current_steps": 1200, "total_steps": 27630, "loss": 0.9492, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.954999387266546e-05, "epoch": 0.43, "percentage": 4.34, "elapsed_time": "0:27:11", "remaining_time": "9:58:47"}
126
+ {"current_steps": 1200, "total_steps": 27630, "loss": null, "eval_loss": 1.0133627653121948, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.43, "percentage": 4.34, "elapsed_time": "0:27:11", "remaining_time": "9:58:47"}
127
+ {"current_steps": 1210, "total_steps": 27630, "loss": 0.9629, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.95423460374269e-05, "epoch": 0.44, "percentage": 4.38, "elapsed_time": "0:29:23", "remaining_time": "10:41:38"}
128
+ {"current_steps": 1220, "total_steps": 27630, "loss": 1.0384, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.953463406004713e-05, "epoch": 0.44, "percentage": 4.42, "elapsed_time": "0:29:30", "remaining_time": "10:38:41"}
129
+ {"current_steps": 1230, "total_steps": 27630, "loss": 1.0235, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.952685795051077e-05, "epoch": 0.45, "percentage": 4.45, "elapsed_time": "0:29:38", "remaining_time": "10:36:02"}
130
+ {"current_steps": 1240, "total_steps": 27630, "loss": 1.0395, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.951901771888552e-05, "epoch": 0.45, "percentage": 4.49, "elapsed_time": "0:29:46", "remaining_time": "10:33:40"}
131
+ {"current_steps": 1250, "total_steps": 27630, "loss": 1.0914, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.951111337532205e-05, "epoch": 0.45, "percentage": 4.52, "elapsed_time": "0:29:55", "remaining_time": "10:31:31"}
132
+ {"current_steps": 1260, "total_steps": 27630, "loss": 1.0714, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.950314493005408e-05, "epoch": 0.46, "percentage": 4.56, "elapsed_time": "0:30:03", "remaining_time": "10:28:55"}
133
+ {"current_steps": 1270, "total_steps": 27630, "loss": 1.0224, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.949511239339831e-05, "epoch": 0.46, "percentage": 4.6, "elapsed_time": "0:30:11", "remaining_time": "10:26:30"}
134
+ {"current_steps": 1280, "total_steps": 27630, "loss": 1.0152, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.948701577575439e-05, "epoch": 0.46, "percentage": 4.63, "elapsed_time": "0:30:21", "remaining_time": "10:24:49"}
135
+ {"current_steps": 1290, "total_steps": 27630, "loss": 0.8988, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.947885508760496e-05, "epoch": 0.47, "percentage": 4.67, "elapsed_time": "0:30:31", "remaining_time": "10:23:07"}
136
+ {"current_steps": 1300, "total_steps": 27630, "loss": 1.0242, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.94706303395156e-05, "epoch": 0.47, "percentage": 4.71, "elapsed_time": "0:30:39", "remaining_time": "10:20:49"}
137
+ {"current_steps": 1310, "total_steps": 27630, "loss": 1.0145, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.946234154213487e-05, "epoch": 0.47, "percentage": 4.74, "elapsed_time": "0:30:47", "remaining_time": "10:18:45"}
138
+ {"current_steps": 1320, "total_steps": 27630, "loss": 1.0197, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.94539887061942e-05, "epoch": 0.48, "percentage": 4.78, "elapsed_time": "0:30:55", "remaining_time": "10:16:20"}
139
+ {"current_steps": 1330, "total_steps": 27630, "loss": 1.0273, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.944557184250794e-05, "epoch": 0.48, "percentage": 4.81, "elapsed_time": "0:31:03", "remaining_time": "10:14:07"}
140
+ {"current_steps": 1340, "total_steps": 27630, "loss": 0.9561, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.943709096197335e-05, "epoch": 0.48, "percentage": 4.85, "elapsed_time": "0:31:11", "remaining_time": "10:12:06"}
141
+ {"current_steps": 1350, "total_steps": 27630, "loss": 0.9678, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.942854607557057e-05, "epoch": 0.49, "percentage": 4.89, "elapsed_time": "0:31:21", "remaining_time": "10:10:33"}
142
+ {"current_steps": 1360, "total_steps": 27630, "loss": 1.0429, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.941993719436262e-05, "epoch": 0.49, "percentage": 4.92, "elapsed_time": "0:31:31", "remaining_time": "10:08:59"}
143
+ {"current_steps": 1370, "total_steps": 27630, "loss": 1.0506, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.941126432949535e-05, "epoch": 0.5, "percentage": 4.96, "elapsed_time": "0:31:39", "remaining_time": "10:06:55"}
144
+ {"current_steps": 1380, "total_steps": 27630, "loss": 1.0326, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.940252749219746e-05, "epoch": 0.5, "percentage": 4.99, "elapsed_time": "0:31:48", "remaining_time": "10:04:55"}
145
+ {"current_steps": 1390, "total_steps": 27630, "loss": 1.0413, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.939372669378048e-05, "epoch": 0.5, "percentage": 5.03, "elapsed_time": "0:31:57", "remaining_time": "10:03:13"}
146
+ {"current_steps": 1400, "total_steps": 27630, "loss": 1.0051, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.938486194563875e-05, "epoch": 0.51, "percentage": 5.07, "elapsed_time": "0:32:04", "remaining_time": "10:00:57"}
147
+ {"current_steps": 1400, "total_steps": 27630, "loss": null, "eval_loss": 1.011703372001648, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.51, "percentage": 5.07, "elapsed_time": "0:32:04", "remaining_time": "10:00:57"}
148
+ {"current_steps": 1410, "total_steps": 27630, "loss": 1.0277, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.937593325924937e-05, "epoch": 0.51, "percentage": 5.1, "elapsed_time": "0:34:18", "remaining_time": "10:37:53"}
149
+ {"current_steps": 1420, "total_steps": 27630, "loss": 0.9802, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.936694064617227e-05, "epoch": 0.51, "percentage": 5.14, "elapsed_time": "0:34:26", "remaining_time": "10:35:44"}
150
+ {"current_steps": 1430, "total_steps": 27630, "loss": 0.9811, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.935788411805011e-05, "epoch": 0.52, "percentage": 5.18, "elapsed_time": "0:34:34", "remaining_time": "10:33:33"}
151
+ {"current_steps": 1440, "total_steps": 27630, "loss": 0.9972, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.934876368660836e-05, "epoch": 0.52, "percentage": 5.21, "elapsed_time": "0:34:43", "remaining_time": "10:31:27"}
152
+ {"current_steps": 1450, "total_steps": 27630, "loss": 1.1006, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.933957936365515e-05, "epoch": 0.52, "percentage": 5.25, "elapsed_time": "0:34:51", "remaining_time": "10:29:14"}
153
+ {"current_steps": 1460, "total_steps": 27630, "loss": 1.0139, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.933033116108134e-05, "epoch": 0.53, "percentage": 5.28, "elapsed_time": "0:35:01", "remaining_time": "10:27:40"}
154
+ {"current_steps": 1470, "total_steps": 27630, "loss": 0.993, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.932101909086056e-05, "epoch": 0.53, "percentage": 5.32, "elapsed_time": "0:35:08", "remaining_time": "10:25:20"}
155
+ {"current_steps": 1480, "total_steps": 27630, "loss": 1.0539, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.931164316504904e-05, "epoch": 0.54, "percentage": 5.36, "elapsed_time": "0:35:16", "remaining_time": "10:23:09"}
156
+ {"current_steps": 1490, "total_steps": 27630, "loss": 0.9599, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.930220339578576e-05, "epoch": 0.54, "percentage": 5.39, "elapsed_time": "0:35:24", "remaining_time": "10:21:18"}
157
+ {"current_steps": 1500, "total_steps": 27630, "loss": 0.9813, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.929269979529232e-05, "epoch": 0.54, "percentage": 5.43, "elapsed_time": "0:35:33", "remaining_time": "10:19:20"}
158
+ {"current_steps": 1510, "total_steps": 27630, "loss": 0.9637, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.928313237587296e-05, "epoch": 0.55, "percentage": 5.47, "elapsed_time": "0:35:44", "remaining_time": "10:18:22"}
159
+ {"current_steps": 1520, "total_steps": 27630, "loss": 1.0375, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.927350114991456e-05, "epoch": 0.55, "percentage": 5.5, "elapsed_time": "0:35:55", "remaining_time": "10:17:09"}
160
+ {"current_steps": 1530, "total_steps": 27630, "loss": 1.0053, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.92638061298866e-05, "epoch": 0.55, "percentage": 5.54, "elapsed_time": "0:36:04", "remaining_time": "10:15:22"}
161
+ {"current_steps": 1540, "total_steps": 27630, "loss": 1.0631, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.925404732834117e-05, "epoch": 0.56, "percentage": 5.57, "elapsed_time": "0:36:13", "remaining_time": "10:13:49"}
162
+ {"current_steps": 1550, "total_steps": 27630, "loss": 1.0134, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.924422475791288e-05, "epoch": 0.56, "percentage": 5.61, "elapsed_time": "0:36:21", "remaining_time": "10:11:51"}
163
+ {"current_steps": 1560, "total_steps": 27630, "loss": 0.9989, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.923433843131901e-05, "epoch": 0.56, "percentage": 5.65, "elapsed_time": "0:36:31", "remaining_time": "10:10:17"}
164
+ {"current_steps": 1570, "total_steps": 27630, "loss": 1.0896, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.922438836135928e-05, "epoch": 0.57, "percentage": 5.68, "elapsed_time": "0:36:39", "remaining_time": "10:08:27"}
165
+ {"current_steps": 1580, "total_steps": 27630, "loss": 0.9954, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.921437456091596e-05, "epoch": 0.57, "percentage": 5.72, "elapsed_time": "0:36:46", "remaining_time": "10:06:19"}
166
+ {"current_steps": 1590, "total_steps": 27630, "loss": 0.9937, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.920429704295391e-05, "epoch": 0.58, "percentage": 5.75, "elapsed_time": "0:36:55", "remaining_time": "10:04:36"}
167
+ {"current_steps": 1600, "total_steps": 27630, "loss": 1.0469, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.919415582052036e-05, "epoch": 0.58, "percentage": 5.79, "elapsed_time": "0:37:03", "remaining_time": "10:02:49"}
168
+ {"current_steps": 1600, "total_steps": 27630, "loss": null, "eval_loss": 1.0105613470077515, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.58, "percentage": 5.79, "elapsed_time": "0:37:03", "remaining_time": "10:02:49"}
169
+ {"current_steps": 1610, "total_steps": 27630, "loss": 1.0408, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.918395090674514e-05, "epoch": 0.58, "percentage": 5.83, "elapsed_time": "0:39:15", "remaining_time": "10:34:34"}
170
+ {"current_steps": 1620, "total_steps": 27630, "loss": 0.9893, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.917368231484045e-05, "epoch": 0.59, "percentage": 5.86, "elapsed_time": "0:39:23", "remaining_time": "10:32:35"}
171
+ {"current_steps": 1630, "total_steps": 27630, "loss": 1.0563, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.916335005810095e-05, "epoch": 0.59, "percentage": 5.9, "elapsed_time": "0:39:31", "remaining_time": "10:30:31"}
172
+ {"current_steps": 1640, "total_steps": 27630, "loss": 1.0061, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.91529541499038e-05, "epoch": 0.59, "percentage": 5.94, "elapsed_time": "0:39:39", "remaining_time": "10:28:27"}
173
+ {"current_steps": 1650, "total_steps": 27630, "loss": 0.9639, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.914249460370846e-05, "epoch": 0.6, "percentage": 5.97, "elapsed_time": "0:39:47", "remaining_time": "10:26:37"}
174
+ {"current_steps": 1660, "total_steps": 27630, "loss": 1.0289, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.913197143305684e-05, "epoch": 0.6, "percentage": 6.01, "elapsed_time": "0:39:57", "remaining_time": "10:25:07"}
175
+ {"current_steps": 1670, "total_steps": 27630, "loss": 1.0154, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.912138465157325e-05, "epoch": 0.6, "percentage": 6.04, "elapsed_time": "0:40:05", "remaining_time": "10:23:11"}
176
+ {"current_steps": 1680, "total_steps": 27630, "loss": 1.0002, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.91107342729643e-05, "epoch": 0.61, "percentage": 6.08, "elapsed_time": "0:40:13", "remaining_time": "10:21:23"}
177
+ {"current_steps": 1690, "total_steps": 27630, "loss": 0.9887, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.910002031101895e-05, "epoch": 0.61, "percentage": 6.12, "elapsed_time": "0:40:22", "remaining_time": "10:19:40"}
178
+ {"current_steps": 1700, "total_steps": 27630, "loss": 1.0703, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.908924277960854e-05, "epoch": 0.62, "percentage": 6.15, "elapsed_time": "0:40:30", "remaining_time": "10:17:52"}
179
+ {"current_steps": 1710, "total_steps": 27630, "loss": 0.9495, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.907840169268662e-05, "epoch": 0.62, "percentage": 6.19, "elapsed_time": "0:40:38", "remaining_time": "10:16:03"}
180
+ {"current_steps": 1720, "total_steps": 27630, "loss": 0.9878, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.90674970642891e-05, "epoch": 0.62, "percentage": 6.23, "elapsed_time": "0:40:47", "remaining_time": "10:14:30"}
181
+ {"current_steps": 1730, "total_steps": 27630, "loss": 1.0351, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.905652890853411e-05, "epoch": 0.63, "percentage": 6.26, "elapsed_time": "0:40:55", "remaining_time": "10:12:47"}
182
+ {"current_steps": 1740, "total_steps": 27630, "loss": 1.0528, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.904549723962206e-05, "epoch": 0.63, "percentage": 6.3, "elapsed_time": "0:41:03", "remaining_time": "10:10:58"}
183
+ {"current_steps": 1750, "total_steps": 27630, "loss": 1.0159, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.903440207183558e-05, "epoch": 0.63, "percentage": 6.33, "elapsed_time": "0:41:11", "remaining_time": "10:09:11"}
184
+ {"current_steps": 1760, "total_steps": 27630, "loss": 0.9879, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.90232434195395e-05, "epoch": 0.64, "percentage": 6.37, "elapsed_time": "0:41:19", "remaining_time": "10:07:26"}
185
+ {"current_steps": 1770, "total_steps": 27630, "loss": 1.0221, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.901202129718086e-05, "epoch": 0.64, "percentage": 6.41, "elapsed_time": "0:41:27", "remaining_time": "10:05:44"}
186
+ {"current_steps": 1780, "total_steps": 27630, "loss": 1.037, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.900073571928886e-05, "epoch": 0.64, "percentage": 6.44, "elapsed_time": "0:41:35", "remaining_time": "10:04:02"}
187
+ {"current_steps": 1790, "total_steps": 27630, "loss": 1.0008, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.898938670047486e-05, "epoch": 0.65, "percentage": 6.48, "elapsed_time": "0:41:46", "remaining_time": "10:02:58"}
188
+ {"current_steps": 1800, "total_steps": 27630, "loss": 0.9994, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.897797425543236e-05, "epoch": 0.65, "percentage": 6.51, "elapsed_time": "0:41:53", "remaining_time": "10:01:10"}
189
+ {"current_steps": 1800, "total_steps": 27630, "loss": null, "eval_loss": 1.0094062089920044, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.65, "percentage": 6.51, "elapsed_time": "0:41:53", "remaining_time": "10:01:10"}
190
+ {"current_steps": 1810, "total_steps": 27630, "loss": 1.0093, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.896649839893699e-05, "epoch": 0.66, "percentage": 6.55, "elapsed_time": "0:44:07", "remaining_time": "10:29:26"}
191
+ {"current_steps": 1820, "total_steps": 27630, "loss": 1.0124, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.895495914584643e-05, "epoch": 0.66, "percentage": 6.59, "elapsed_time": "0:44:15", "remaining_time": "10:27:41"}
192
+ {"current_steps": 1830, "total_steps": 27630, "loss": 1.0197, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.894335651110051e-05, "epoch": 0.66, "percentage": 6.62, "elapsed_time": "0:44:23", "remaining_time": "10:25:57"}
193
+ {"current_steps": 1840, "total_steps": 27630, "loss": 0.9469, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.893169050972106e-05, "epoch": 0.67, "percentage": 6.66, "elapsed_time": "0:44:33", "remaining_time": "10:24:30"}
194
+ {"current_steps": 1850, "total_steps": 27630, "loss": 1.0153, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.8919961156812e-05, "epoch": 0.67, "percentage": 6.7, "elapsed_time": "0:44:41", "remaining_time": "10:22:41"}
195
+ {"current_steps": 1860, "total_steps": 27630, "loss": 1.0124, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.89081684675592e-05, "epoch": 0.67, "percentage": 6.73, "elapsed_time": "0:44:49", "remaining_time": "10:21:03"}
196
+ {"current_steps": 1870, "total_steps": 27630, "loss": 0.96, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.88963124572306e-05, "epoch": 0.68, "percentage": 6.77, "elapsed_time": "0:44:55", "remaining_time": "10:18:56"}
197
+ {"current_steps": 1880, "total_steps": 27630, "loss": 1.0448, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.88843931411761e-05, "epoch": 0.68, "percentage": 6.8, "elapsed_time": "0:45:03", "remaining_time": "10:17:11"}
198
+ {"current_steps": 1890, "total_steps": 27630, "loss": 1.0362, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.887241053482757e-05, "epoch": 0.68, "percentage": 6.84, "elapsed_time": "0:45:12", "remaining_time": "10:15:43"}
199
+ {"current_steps": 1900, "total_steps": 27630, "loss": 1.0658, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.886036465369877e-05, "epoch": 0.69, "percentage": 6.88, "elapsed_time": "0:45:19", "remaining_time": "10:13:49"}
200
+ {"current_steps": 1910, "total_steps": 27630, "loss": 1.0068, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.884825551338546e-05, "epoch": 0.69, "percentage": 6.91, "elapsed_time": "0:45:26", "remaining_time": "10:12:00"}
201
+ {"current_steps": 1920, "total_steps": 27630, "loss": 1.0147, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.883608312956524e-05, "epoch": 0.69, "percentage": 6.95, "elapsed_time": "0:45:36", "remaining_time": "10:10:38"}
202
+ {"current_steps": 1930, "total_steps": 27630, "loss": 0.9421, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.882384751799762e-05, "epoch": 0.7, "percentage": 6.99, "elapsed_time": "0:45:45", "remaining_time": "10:09:14"}
203
+ {"current_steps": 1940, "total_steps": 27630, "loss": 1.0032, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.881154869452395e-05, "epoch": 0.7, "percentage": 7.02, "elapsed_time": "0:45:52", "remaining_time": "10:07:33"}
204
+ {"current_steps": 1950, "total_steps": 27630, "loss": 1.0491, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.879918667506748e-05, "epoch": 0.71, "percentage": 7.06, "elapsed_time": "0:46:00", "remaining_time": "10:05:54"}
205
+ {"current_steps": 1960, "total_steps": 27630, "loss": 0.9823, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.87867614756332e-05, "epoch": 0.71, "percentage": 7.09, "elapsed_time": "0:46:07", "remaining_time": "10:04:10"}
206
+ {"current_steps": 1970, "total_steps": 27630, "loss": 1.0326, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.87742731123079e-05, "epoch": 0.71, "percentage": 7.13, "elapsed_time": "0:46:16", "remaining_time": "10:02:44"}
207
+ {"current_steps": 1980, "total_steps": 27630, "loss": 1.0256, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.876172160126024e-05, "epoch": 0.72, "percentage": 7.17, "elapsed_time": "0:46:24", "remaining_time": "10:01:13"}
208
+ {"current_steps": 1990, "total_steps": 27630, "loss": 1.0301, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.874910695874053e-05, "epoch": 0.72, "percentage": 7.2, "elapsed_time": "0:46:32", "remaining_time": "9:59:35"}
209
+ {"current_steps": 2000, "total_steps": 27630, "loss": 1.0141, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.873642920108091e-05, "epoch": 0.72, "percentage": 7.24, "elapsed_time": "0:46:39", "remaining_time": "9:58:00"}
210
+ {"current_steps": 2000, "total_steps": 27630, "loss": null, "eval_loss": 1.0082145929336548, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.72, "percentage": 7.24, "elapsed_time": "0:46:39", "remaining_time": "9:58:00"}
211
+ {"current_steps": 2010, "total_steps": 27630, "loss": 0.9554, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.872368834469514e-05, "epoch": 0.73, "percentage": 7.27, "elapsed_time": "0:48:54", "remaining_time": "10:23:26"}
212
+ {"current_steps": 2020, "total_steps": 27630, "loss": 1.0374, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.871088440607874e-05, "epoch": 0.73, "percentage": 7.31, "elapsed_time": "0:49:02", "remaining_time": "10:21:40"}
213
+ {"current_steps": 2030, "total_steps": 27630, "loss": 1.01, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.869801740180889e-05, "epoch": 0.73, "percentage": 7.35, "elapsed_time": "0:49:10", "remaining_time": "10:20:12"}
214
+ {"current_steps": 2040, "total_steps": 27630, "loss": 1.0244, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.86850873485444e-05, "epoch": 0.74, "percentage": 7.38, "elapsed_time": "0:49:19", "remaining_time": "10:18:42"}
215
+ {"current_steps": 2050, "total_steps": 27630, "loss": 0.9303, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.867209426302572e-05, "epoch": 0.74, "percentage": 7.42, "elapsed_time": "0:49:26", "remaining_time": "10:17:01"}
216
+ {"current_steps": 2060, "total_steps": 27630, "loss": 1.0851, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.865903816207493e-05, "epoch": 0.75, "percentage": 7.46, "elapsed_time": "0:49:34", "remaining_time": "10:15:27"}
217
+ {"current_steps": 2070, "total_steps": 27630, "loss": 1.0042, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.864591906259568e-05, "epoch": 0.75, "percentage": 7.49, "elapsed_time": "0:49:41", "remaining_time": "10:13:38"}
218
+ {"current_steps": 2080, "total_steps": 27630, "loss": 0.9866, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.863273698157315e-05, "epoch": 0.75, "percentage": 7.53, "elapsed_time": "0:49:52", "remaining_time": "10:12:39"}
219
+ {"current_steps": 2090, "total_steps": 27630, "loss": 1.056, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.861949193607411e-05, "epoch": 0.76, "percentage": 7.56, "elapsed_time": "0:50:00", "remaining_time": "10:11:01"}
220
+ {"current_steps": 2100, "total_steps": 27630, "loss": 0.9988, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.860618394324682e-05, "epoch": 0.76, "percentage": 7.6, "elapsed_time": "0:50:07", "remaining_time": "10:09:24"}
221
+ {"current_steps": 2110, "total_steps": 27630, "loss": 0.9562, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.859281302032106e-05, "epoch": 0.76, "percentage": 7.64, "elapsed_time": "0:50:15", "remaining_time": "10:07:56"}
222
+ {"current_steps": 2120, "total_steps": 27630, "loss": 1.0325, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.857937918460808e-05, "epoch": 0.77, "percentage": 7.67, "elapsed_time": "0:50:24", "remaining_time": "10:06:30"}
223
+ {"current_steps": 2130, "total_steps": 27630, "loss": 1.0458, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.856588245350056e-05, "epoch": 0.77, "percentage": 7.71, "elapsed_time": "0:50:33", "remaining_time": "10:05:15"}
224
+ {"current_steps": 2140, "total_steps": 27630, "loss": 1.089, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.855232284447262e-05, "epoch": 0.77, "percentage": 7.75, "elapsed_time": "0:50:41", "remaining_time": "10:03:44"}
225
+ {"current_steps": 2150, "total_steps": 27630, "loss": 1.0398, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.853870037507983e-05, "epoch": 0.78, "percentage": 7.78, "elapsed_time": "0:50:48", "remaining_time": "10:02:11"}
226
+ {"current_steps": 2160, "total_steps": 27630, "loss": 1.0038, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.852501506295907e-05, "epoch": 0.78, "percentage": 7.82, "elapsed_time": "0:50:57", "remaining_time": "10:00:53"}
227
+ {"current_steps": 2170, "total_steps": 27630, "loss": 1.0343, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.851126692582864e-05, "epoch": 0.79, "percentage": 7.85, "elapsed_time": "0:51:05", "remaining_time": "9:59:23"}
228
+ {"current_steps": 2180, "total_steps": 27630, "loss": 0.9986, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.849745598148817e-05, "epoch": 0.79, "percentage": 7.89, "elapsed_time": "0:51:13", "remaining_time": "9:57:58"}
229
+ {"current_steps": 2190, "total_steps": 27630, "loss": 1.035, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.848358224781857e-05, "epoch": 0.79, "percentage": 7.93, "elapsed_time": "0:51:22", "remaining_time": "9:56:49"}
230
+ {"current_steps": 2200, "total_steps": 27630, "loss": 1.0891, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.84696457427821e-05, "epoch": 0.8, "percentage": 7.96, "elapsed_time": "0:51:30", "remaining_time": "9:55:20"}
231
+ {"current_steps": 2200, "total_steps": 27630, "loss": null, "eval_loss": 1.0072919130325317, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.8, "percentage": 7.96, "elapsed_time": "0:51:30", "remaining_time": "9:55:20"}
232
+ {"current_steps": 2210, "total_steps": 27630, "loss": 1.0259, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.845564648442222e-05, "epoch": 0.8, "percentage": 8.0, "elapsed_time": "0:53:43", "remaining_time": "10:18:02"}
233
+ {"current_steps": 2220, "total_steps": 27630, "loss": 1.0457, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.844158449086371e-05, "epoch": 0.8, "percentage": 8.03, "elapsed_time": "0:53:53", "remaining_time": "10:16:55"}
234
+ {"current_steps": 2230, "total_steps": 27630, "loss": 0.9869, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.842745978031253e-05, "epoch": 0.81, "percentage": 8.07, "elapsed_time": "0:54:02", "remaining_time": "10:15:34"}
235
+ {"current_steps": 2240, "total_steps": 27630, "loss": 1.0158, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.841327237105585e-05, "epoch": 0.81, "percentage": 8.11, "elapsed_time": "0:54:12", "remaining_time": "10:14:31"}
236
+ {"current_steps": 2250, "total_steps": 27630, "loss": 0.997, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.8399022281462e-05, "epoch": 0.81, "percentage": 8.14, "elapsed_time": "0:54:21", "remaining_time": "10:13:08"}
237
+ {"current_steps": 2260, "total_steps": 27630, "loss": 1.0148, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.838470952998049e-05, "epoch": 0.82, "percentage": 8.18, "elapsed_time": "0:54:30", "remaining_time": "10:11:51"}
238
+ {"current_steps": 2270, "total_steps": 27630, "loss": 0.9787, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.837033413514191e-05, "epoch": 0.82, "percentage": 8.22, "elapsed_time": "0:54:39", "remaining_time": "10:10:37"}
239
+ {"current_steps": 2280, "total_steps": 27630, "loss": 0.9656, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.835589611555805e-05, "epoch": 0.83, "percentage": 8.25, "elapsed_time": "0:54:47", "remaining_time": "10:09:16"}
240
+ {"current_steps": 2290, "total_steps": 27630, "loss": 0.9837, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.834139548992165e-05, "epoch": 0.83, "percentage": 8.29, "elapsed_time": "0:54:55", "remaining_time": "10:07:49"}
241
+ {"current_steps": 2300, "total_steps": 27630, "loss": 1.0513, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.832683227700661e-05, "epoch": 0.83, "percentage": 8.32, "elapsed_time": "0:55:04", "remaining_time": "10:06:37"}
242
+ {"current_steps": 2310, "total_steps": 27630, "loss": 0.9649, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.831220649566782e-05, "epoch": 0.84, "percentage": 8.36, "elapsed_time": "0:55:13", "remaining_time": "10:05:20"}
243
+ {"current_steps": 2320, "total_steps": 27630, "loss": 1.0208, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.829751816484116e-05, "epoch": 0.84, "percentage": 8.4, "elapsed_time": "0:55:20", "remaining_time": "10:03:48"}
244
+ {"current_steps": 2330, "total_steps": 27630, "loss": 0.9512, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.828276730354353e-05, "epoch": 0.84, "percentage": 8.43, "elapsed_time": "0:55:30", "remaining_time": "10:02:46"}
245
+ {"current_steps": 2340, "total_steps": 27630, "loss": 0.976, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.826795393087278e-05, "epoch": 0.85, "percentage": 8.47, "elapsed_time": "0:55:40", "remaining_time": "10:01:45"}
246
+ {"current_steps": 2350, "total_steps": 27630, "loss": 1.0036, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.825307806600765e-05, "epoch": 0.85, "percentage": 8.51, "elapsed_time": "0:55:50", "remaining_time": "10:00:40"}
247
+ {"current_steps": 2360, "total_steps": 27630, "loss": 1.0555, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.823813972820786e-05, "epoch": 0.85, "percentage": 8.54, "elapsed_time": "0:55:58", "remaining_time": "9:59:17"}
248
+ {"current_steps": 2370, "total_steps": 27630, "loss": 1.0483, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.822313893681397e-05, "epoch": 0.86, "percentage": 8.58, "elapsed_time": "0:56:06", "remaining_time": "9:58:06"}
249
+ {"current_steps": 2380, "total_steps": 27630, "loss": 1.0102, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.820807571124738e-05, "epoch": 0.86, "percentage": 8.61, "elapsed_time": "0:56:15", "remaining_time": "9:56:48"}
250
+ {"current_steps": 2390, "total_steps": 27630, "loss": 1.0626, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.819295007101035e-05, "epoch": 0.87, "percentage": 8.65, "elapsed_time": "0:56:24", "remaining_time": "9:55:44"}
251
+ {"current_steps": 2400, "total_steps": 27630, "loss": 1.0141, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.817776203568596e-05, "epoch": 0.87, "percentage": 8.69, "elapsed_time": "0:56:33", "remaining_time": "9:54:36"}
252
+ {"current_steps": 2400, "total_steps": 27630, "loss": null, "eval_loss": 1.0063296556472778, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.87, "percentage": 8.69, "elapsed_time": "0:56:33", "remaining_time": "9:54:36"}
253
+ {"current_steps": 2410, "total_steps": 27630, "loss": 1.0222, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.816251162493804e-05, "epoch": 0.87, "percentage": 8.72, "elapsed_time": "0:58:48", "remaining_time": "10:15:26"}
254
+ {"current_steps": 2420, "total_steps": 27630, "loss": 0.9891, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.814719885851121e-05, "epoch": 0.88, "percentage": 8.76, "elapsed_time": "0:58:56", "remaining_time": "10:14:04"}
255
+ {"current_steps": 2430, "total_steps": 27630, "loss": 0.9785, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.81318237562308e-05, "epoch": 0.88, "percentage": 8.79, "elapsed_time": "0:59:06", "remaining_time": "10:12:57"}
256
+ {"current_steps": 2440, "total_steps": 27630, "loss": 0.9357, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.811638633800287e-05, "epoch": 0.88, "percentage": 8.83, "elapsed_time": "0:59:13", "remaining_time": "10:11:24"}
257
+ {"current_steps": 2450, "total_steps": 27630, "loss": 1.0485, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.81008866238141e-05, "epoch": 0.89, "percentage": 8.87, "elapsed_time": "0:59:23", "remaining_time": "10:10:19"}
258
+ {"current_steps": 2460, "total_steps": 27630, "loss": 1.0138, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.808532463373188e-05, "epoch": 0.89, "percentage": 8.9, "elapsed_time": "0:59:32", "remaining_time": "10:09:11"}
259
+ {"current_steps": 2470, "total_steps": 27630, "loss": 1.0421, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.806970038790423e-05, "epoch": 0.89, "percentage": 8.94, "elapsed_time": "0:59:41", "remaining_time": "10:08:01"}
260
+ {"current_steps": 2480, "total_steps": 27630, "loss": 0.9926, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.805401390655975e-05, "epoch": 0.9, "percentage": 8.98, "elapsed_time": "0:59:49", "remaining_time": "10:06:39"}
261
+ {"current_steps": 2490, "total_steps": 27630, "loss": 1.0013, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.803826521000761e-05, "epoch": 0.9, "percentage": 9.01, "elapsed_time": "0:59:57", "remaining_time": "10:05:25"}
262
+ {"current_steps": 2500, "total_steps": 27630, "loss": 0.9937, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.802245431863757e-05, "epoch": 0.9, "percentage": 9.05, "elapsed_time": "1:00:05", "remaining_time": "10:04:01"}
263
+ {"current_steps": 2510, "total_steps": 27630, "loss": 0.9986, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.800658125291984e-05, "epoch": 0.91, "percentage": 9.08, "elapsed_time": "1:00:13", "remaining_time": "10:02:39"}
264
+ {"current_steps": 2520, "total_steps": 27630, "loss": 0.9984, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.79906460334052e-05, "epoch": 0.91, "percentage": 9.12, "elapsed_time": "1:00:20", "remaining_time": "10:01:16"}
265
+ {"current_steps": 2530, "total_steps": 27630, "loss": 1.0273, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.797464868072488e-05, "epoch": 0.92, "percentage": 9.16, "elapsed_time": "1:00:28", "remaining_time": "9:59:57"}
266
+ {"current_steps": 2540, "total_steps": 27630, "loss": 1.0346, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.795858921559052e-05, "epoch": 0.92, "percentage": 9.19, "elapsed_time": "1:00:36", "remaining_time": "9:58:43"}
267
+ {"current_steps": 2550, "total_steps": 27630, "loss": 1.0691, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.79424676587942e-05, "epoch": 0.92, "percentage": 9.23, "elapsed_time": "1:00:44", "remaining_time": "9:57:29"}
268
+ {"current_steps": 2560, "total_steps": 27630, "loss": 1.009, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.792628403120842e-05, "epoch": 0.93, "percentage": 9.27, "elapsed_time": "1:00:52", "remaining_time": "9:56:05"}
269
+ {"current_steps": 2570, "total_steps": 27630, "loss": 1.0015, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.791003835378598e-05, "epoch": 0.93, "percentage": 9.3, "elapsed_time": "1:01:01", "remaining_time": "9:55:00"}
270
+ {"current_steps": 2580, "total_steps": 27630, "loss": 1.0177, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.789373064756008e-05, "epoch": 0.93, "percentage": 9.34, "elapsed_time": "1:01:10", "remaining_time": "9:53:54"}
271
+ {"current_steps": 2590, "total_steps": 27630, "loss": 1.0935, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.787736093364416e-05, "epoch": 0.94, "percentage": 9.37, "elapsed_time": "1:01:20", "remaining_time": "9:53:02"}
272
+ {"current_steps": 2600, "total_steps": 27630, "loss": 1.0002, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.786092923323203e-05, "epoch": 0.94, "percentage": 9.41, "elapsed_time": "1:01:28", "remaining_time": "9:51:52"}
273
+ {"current_steps": 2600, "total_steps": 27630, "loss": null, "eval_loss": 1.005922555923462, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.94, "percentage": 9.41, "elapsed_time": "1:01:28", "remaining_time": "9:51:52"}
274
+ {"current_steps": 2610, "total_steps": 27630, "loss": 1.0305, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.784443556759766e-05, "epoch": 0.94, "percentage": 9.45, "elapsed_time": "1:03:42", "remaining_time": "10:10:38"}
275
+ {"current_steps": 2620, "total_steps": 27630, "loss": 1.0427, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.78278799580953e-05, "epoch": 0.95, "percentage": 9.48, "elapsed_time": "1:03:51", "remaining_time": "10:09:36"}
276
+ {"current_steps": 2630, "total_steps": 27630, "loss": 1.0059, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.781126242615939e-05, "epoch": 0.95, "percentage": 9.52, "elapsed_time": "1:04:01", "remaining_time": "10:08:35"}
277
+ {"current_steps": 2640, "total_steps": 27630, "loss": 1.0418, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.779458299330452e-05, "epoch": 0.96, "percentage": 9.55, "elapsed_time": "1:04:09", "remaining_time": "10:07:23"}
278
+ {"current_steps": 2650, "total_steps": 27630, "loss": 1.0092, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.777784168112545e-05, "epoch": 0.96, "percentage": 9.59, "elapsed_time": "1:04:19", "remaining_time": "10:06:19"}
279
+ {"current_steps": 2660, "total_steps": 27630, "loss": 0.9883, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.776103851129706e-05, "epoch": 0.96, "percentage": 9.63, "elapsed_time": "1:04:28", "remaining_time": "10:05:10"}
280
+ {"current_steps": 2670, "total_steps": 27630, "loss": 1.0753, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.774417350557428e-05, "epoch": 0.97, "percentage": 9.66, "elapsed_time": "1:04:38", "remaining_time": "10:04:17"}
281
+ {"current_steps": 2680, "total_steps": 27630, "loss": 1.0524, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.772724668579212e-05, "epoch": 0.97, "percentage": 9.7, "elapsed_time": "1:04:48", "remaining_time": "10:03:23"}
282
+ {"current_steps": 2690, "total_steps": 27630, "loss": 1.0562, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.771025807386562e-05, "epoch": 0.97, "percentage": 9.74, "elapsed_time": "1:04:56", "remaining_time": "10:02:03"}
283
+ {"current_steps": 2700, "total_steps": 27630, "loss": 0.9925, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.769320769178983e-05, "epoch": 0.98, "percentage": 9.77, "elapsed_time": "1:05:03", "remaining_time": "10:00:45"}
284
+ {"current_steps": 2710, "total_steps": 27630, "loss": 1.014, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.767609556163977e-05, "epoch": 0.98, "percentage": 9.81, "elapsed_time": "1:05:11", "remaining_time": "9:59:31"}
285
+ {"current_steps": 2720, "total_steps": 27630, "loss": 0.9677, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.765892170557038e-05, "epoch": 0.98, "percentage": 9.84, "elapsed_time": "1:05:18", "remaining_time": "9:58:10"}
286
+ {"current_steps": 2730, "total_steps": 27630, "loss": 0.9954, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.764168614581655e-05, "epoch": 0.99, "percentage": 9.88, "elapsed_time": "1:05:27", "remaining_time": "9:57:03"}
287
+ {"current_steps": 2740, "total_steps": 27630, "loss": 1.0029, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.762438890469304e-05, "epoch": 0.99, "percentage": 9.92, "elapsed_time": "1:05:36", "remaining_time": "9:56:00"}
288
+ {"current_steps": 2750, "total_steps": 27630, "loss": 1.0555, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.760703000459446e-05, "epoch": 1.0, "percentage": 9.95, "elapsed_time": "1:05:44", "remaining_time": "9:54:45"}
289
+ {"current_steps": 2760, "total_steps": 27630, "loss": 1.0394, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.758960946799528e-05, "epoch": 1.0, "percentage": 9.99, "elapsed_time": "1:05:53", "remaining_time": "9:53:44"}
290
+ {"current_steps": 2770, "total_steps": 27630, "loss": 0.9325, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.757212731744974e-05, "epoch": 1.0, "percentage": 10.03, "elapsed_time": "1:06:02", "remaining_time": "9:52:44"}
291
+ {"current_steps": 2780, "total_steps": 27630, "loss": 0.9711, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.755458357559186e-05, "epoch": 1.01, "percentage": 10.06, "elapsed_time": "1:06:09", "remaining_time": "9:51:26"}
292
+ {"current_steps": 2790, "total_steps": 27630, "loss": 0.9651, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.753697826513541e-05, "epoch": 1.01, "percentage": 10.1, "elapsed_time": "1:06:17", "remaining_time": "9:50:13"}
293
+ {"current_steps": 2800, "total_steps": 27630, "loss": 0.9686, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.751931140887387e-05, "epoch": 1.01, "percentage": 10.13, "elapsed_time": "1:06:25", "remaining_time": "9:49:02"}
294
+ {"current_steps": 2800, "total_steps": 27630, "loss": null, "eval_loss": 1.0086077451705933, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.01, "percentage": 10.13, "elapsed_time": "1:06:25", "remaining_time": "9:49:02"}
295
+ {"current_steps": 2810, "total_steps": 27630, "loss": 0.9267, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.750158302968039e-05, "epoch": 1.02, "percentage": 10.17, "elapsed_time": "1:08:40", "remaining_time": "10:06:36"}
296
+ {"current_steps": 2820, "total_steps": 27630, "loss": 1.0193, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.748379315050778e-05, "epoch": 1.02, "percentage": 10.21, "elapsed_time": "1:08:48", "remaining_time": "10:05:18"}
297
+ {"current_steps": 2830, "total_steps": 27630, "loss": 0.8893, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.74659417943885e-05, "epoch": 1.02, "percentage": 10.24, "elapsed_time": "1:08:56", "remaining_time": "10:04:05"}
298
+ {"current_steps": 2840, "total_steps": 27630, "loss": 0.937, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.744802898443456e-05, "epoch": 1.03, "percentage": 10.28, "elapsed_time": "1:09:04", "remaining_time": "10:02:58"}
299
+ {"current_steps": 2850, "total_steps": 27630, "loss": 0.949, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.743005474383755e-05, "epoch": 1.03, "percentage": 10.31, "elapsed_time": "1:09:12", "remaining_time": "10:01:46"}
300
+ {"current_steps": 2860, "total_steps": 27630, "loss": 0.9897, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.741201909586861e-05, "epoch": 1.04, "percentage": 10.35, "elapsed_time": "1:09:20", "remaining_time": "10:00:31"}
301
+ {"current_steps": 2870, "total_steps": 27630, "loss": 0.9393, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.739392206387838e-05, "epoch": 1.04, "percentage": 10.39, "elapsed_time": "1:09:29", "remaining_time": "9:59:28"}
302
+ {"current_steps": 2880, "total_steps": 27630, "loss": 0.9365, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.737576367129694e-05, "epoch": 1.04, "percentage": 10.42, "elapsed_time": "1:09:37", "remaining_time": "9:58:16"}
303
+ {"current_steps": 2890, "total_steps": 27630, "loss": 1.0074, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.735754394163386e-05, "epoch": 1.05, "percentage": 10.46, "elapsed_time": "1:09:45", "remaining_time": "9:57:07"}
304
+ {"current_steps": 2900, "total_steps": 27630, "loss": 0.9682, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.73392628984781e-05, "epoch": 1.05, "percentage": 10.5, "elapsed_time": "1:09:53", "remaining_time": "9:56:02"}
305
+ {"current_steps": 2910, "total_steps": 27630, "loss": 0.9753, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.732092056549799e-05, "epoch": 1.05, "percentage": 10.53, "elapsed_time": "1:10:01", "remaining_time": "9:54:54"}
306
+ {"current_steps": 2920, "total_steps": 27630, "loss": 0.926, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.730251696644122e-05, "epoch": 1.06, "percentage": 10.57, "elapsed_time": "1:10:08", "remaining_time": "9:53:37"}
307
+ {"current_steps": 2930, "total_steps": 27630, "loss": 0.9993, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.728405212513483e-05, "epoch": 1.06, "percentage": 10.6, "elapsed_time": "1:10:16", "remaining_time": "9:52:23"}
308
+ {"current_steps": 2940, "total_steps": 27630, "loss": 0.9879, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.726552606548512e-05, "epoch": 1.06, "percentage": 10.64, "elapsed_time": "1:10:24", "remaining_time": "9:51:15"}
309
+ {"current_steps": 2950, "total_steps": 27630, "loss": 0.9626, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.724693881147761e-05, "epoch": 1.07, "percentage": 10.68, "elapsed_time": "1:10:32", "remaining_time": "9:50:13"}
310
+ {"current_steps": 2960, "total_steps": 27630, "loss": 0.9767, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.722829038717717e-05, "epoch": 1.07, "percentage": 10.71, "elapsed_time": "1:10:40", "remaining_time": "9:48:59"}
311
+ {"current_steps": 2970, "total_steps": 27630, "loss": 0.9357, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.720958081672773e-05, "epoch": 1.07, "percentage": 10.75, "elapsed_time": "1:10:47", "remaining_time": "9:47:48"}
312
+ {"current_steps": 2980, "total_steps": 27630, "loss": 0.9705, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.719081012435247e-05, "epoch": 1.08, "percentage": 10.79, "elapsed_time": "1:10:54", "remaining_time": "9:46:35"}
313
+ {"current_steps": 2990, "total_steps": 27630, "loss": 0.9727, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.717197833435367e-05, "epoch": 1.08, "percentage": 10.82, "elapsed_time": "1:11:02", "remaining_time": "9:45:28"}
314
+ {"current_steps": 3000, "total_steps": 27630, "loss": 0.9767, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.715308547111273e-05, "epoch": 1.09, "percentage": 10.86, "elapsed_time": "1:11:10", "remaining_time": "9:44:21"}
315
+ {"current_steps": 3000, "total_steps": 27630, "loss": null, "eval_loss": 1.014098048210144, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.09, "percentage": 10.86, "elapsed_time": "1:11:10", "remaining_time": "9:44:21"}
316
+ {"current_steps": 3010, "total_steps": 27630, "loss": 0.9605, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.713413155909009e-05, "epoch": 1.09, "percentage": 10.89, "elapsed_time": "1:13:25", "remaining_time": "10:00:32"}
317
+ {"current_steps": 3020, "total_steps": 27630, "loss": 0.9611, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.711511662282527e-05, "epoch": 1.09, "percentage": 10.93, "elapsed_time": "1:13:34", "remaining_time": "9:59:30"}
318
+ {"current_steps": 3030, "total_steps": 27630, "loss": 0.9222, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.709604068693679e-05, "epoch": 1.1, "percentage": 10.97, "elapsed_time": "1:13:43", "remaining_time": "9:58:31"}
319
+ {"current_steps": 3040, "total_steps": 27630, "loss": 0.9369, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.707690377612211e-05, "epoch": 1.1, "percentage": 11.0, "elapsed_time": "1:13:51", "remaining_time": "9:57:28"}
320
+ {"current_steps": 3050, "total_steps": 27630, "loss": 0.8864, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.705770591515768e-05, "epoch": 1.1, "percentage": 11.04, "elapsed_time": "1:14:00", "remaining_time": "9:56:26"}
321
+ {"current_steps": 3060, "total_steps": 27630, "loss": 0.9753, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.703844712889884e-05, "epoch": 1.11, "percentage": 11.07, "elapsed_time": "1:14:09", "remaining_time": "9:55:22"}
322
+ {"current_steps": 3070, "total_steps": 27630, "loss": 0.9233, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.701912744227979e-05, "epoch": 1.11, "percentage": 11.11, "elapsed_time": "1:14:16", "remaining_time": "9:54:10"}
323
+ {"current_steps": 3080, "total_steps": 27630, "loss": 0.987, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.699974688031363e-05, "epoch": 1.11, "percentage": 11.15, "elapsed_time": "1:14:23", "remaining_time": "9:52:59"}
324
+ {"current_steps": 3090, "total_steps": 27630, "loss": 0.8833, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.69803054680922e-05, "epoch": 1.12, "percentage": 11.18, "elapsed_time": "1:14:32", "remaining_time": "9:51:55"}
325
+ {"current_steps": 3100, "total_steps": 27630, "loss": 0.9894, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.696080323078621e-05, "epoch": 1.12, "percentage": 11.22, "elapsed_time": "1:14:39", "remaining_time": "9:50:49"}
326
+ {"current_steps": 3110, "total_steps": 27630, "loss": 0.9417, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.694124019364505e-05, "epoch": 1.13, "percentage": 11.26, "elapsed_time": "1:14:47", "remaining_time": "9:49:42"}
327
+ {"current_steps": 3120, "total_steps": 27630, "loss": 0.9251, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.692161638199686e-05, "epoch": 1.13, "percentage": 11.29, "elapsed_time": "1:14:55", "remaining_time": "9:48:35"}
328
+ {"current_steps": 3130, "total_steps": 27630, "loss": 0.9447, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.690193182124844e-05, "epoch": 1.13, "percentage": 11.33, "elapsed_time": "1:15:04", "remaining_time": "9:47:40"}
329
+ {"current_steps": 3140, "total_steps": 27630, "loss": 0.9984, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.68821865368853e-05, "epoch": 1.14, "percentage": 11.36, "elapsed_time": "1:15:13", "remaining_time": "9:46:43"}
330
+ {"current_steps": 3150, "total_steps": 27630, "loss": 0.9422, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.686238055447148e-05, "epoch": 1.14, "percentage": 11.4, "elapsed_time": "1:15:22", "remaining_time": "9:45:45"}
331
+ {"current_steps": 3160, "total_steps": 27630, "loss": 0.9199, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.684251389964967e-05, "epoch": 1.14, "percentage": 11.44, "elapsed_time": "1:15:29", "remaining_time": "9:44:37"}
332
+ {"current_steps": 3170, "total_steps": 27630, "loss": 0.9249, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.68225865981411e-05, "epoch": 1.15, "percentage": 11.47, "elapsed_time": "1:15:40", "remaining_time": "9:43:52"}
333
+ {"current_steps": 3180, "total_steps": 27630, "loss": 0.947, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.680259867574552e-05, "epoch": 1.15, "percentage": 11.51, "elapsed_time": "1:15:49", "remaining_time": "9:42:59"}
334
+ {"current_steps": 3190, "total_steps": 27630, "loss": 0.9956, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.678255015834112e-05, "epoch": 1.15, "percentage": 11.55, "elapsed_time": "1:15:57", "remaining_time": "9:41:53"}
335
+ {"current_steps": 3200, "total_steps": 27630, "loss": 0.9494, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.676244107188463e-05, "epoch": 1.16, "percentage": 11.58, "elapsed_time": "1:16:06", "remaining_time": "9:41:02"}
336
+ {"current_steps": 3200, "total_steps": 27630, "loss": null, "eval_loss": 1.0160499811172485, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.16, "percentage": 11.58, "elapsed_time": "1:16:06", "remaining_time": "9:41:02"}
337
+ {"current_steps": 3200, "total_steps": 27630, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.16, "percentage": 11.58, "elapsed_time": "1:16:06", "remaining_time": "9:41:02"}
338
+ {"current_steps": 488, "total_steps": 488, "loss": null, "eval_loss": 1.0082145929336548, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.16, "percentage": 100.0, "elapsed_time": "1:20:24", "remaining_time": "0:00:00"}
llama2_13b_peft/alpaca/trainer_state.json ADDED
@@ -0,0 +1,2398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.0082145929336548,
3
+ "best_model_checkpoint": "ckpt/llama2_13b_fuze15_no_sys/alpaca_no_sys/checkpoint-2000",
4
+ "epoch": 1.158161418747738,
5
+ "eval_steps": 200,
6
+ "global_step": 3200,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "grad_norm": 0.2189224660396576,
14
+ "learning_rate": 5e-05,
15
+ "loss": 1.4369,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.01,
20
+ "grad_norm": 0.4817655384540558,
21
+ "learning_rate": 0.0001,
22
+ "loss": 1.3624,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.01,
27
+ "grad_norm": 0.35323551297187805,
28
+ "learning_rate": 9.999996763266864e-05,
29
+ "loss": 1.1589,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.01,
34
+ "grad_norm": 0.2697048485279083,
35
+ "learning_rate": 9.999987053071647e-05,
36
+ "loss": 1.1103,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.02,
41
+ "grad_norm": 0.34059372544288635,
42
+ "learning_rate": 9.99997086942692e-05,
43
+ "loss": 1.0601,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.02,
48
+ "grad_norm": 0.2907443344593048,
49
+ "learning_rate": 9.999948212353635e-05,
50
+ "loss": 1.0302,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.03,
55
+ "grad_norm": 0.4002208113670349,
56
+ "learning_rate": 9.999919081881129e-05,
57
+ "loss": 1.114,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.03,
62
+ "grad_norm": 0.4364459216594696,
63
+ "learning_rate": 9.999883478047113e-05,
64
+ "loss": 1.0913,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.03,
69
+ "grad_norm": 0.322396844625473,
70
+ "learning_rate": 9.999841400897687e-05,
71
+ "loss": 1.0778,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.04,
76
+ "grad_norm": 0.5678238868713379,
77
+ "learning_rate": 9.999792850487325e-05,
78
+ "loss": 1.0493,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.04,
83
+ "grad_norm": 0.2919568717479706,
84
+ "learning_rate": 9.999737826878886e-05,
85
+ "loss": 1.0249,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.04,
90
+ "grad_norm": 0.3787660300731659,
91
+ "learning_rate": 9.99967633014361e-05,
92
+ "loss": 1.0594,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.05,
97
+ "grad_norm": 0.33062055706977844,
98
+ "learning_rate": 9.999608360361113e-05,
99
+ "loss": 1.0527,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.05,
104
+ "grad_norm": 0.3306855857372284,
105
+ "learning_rate": 9.999533917619399e-05,
106
+ "loss": 1.0051,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.05,
111
+ "grad_norm": 0.41762664914131165,
112
+ "learning_rate": 9.999453002014846e-05,
113
+ "loss": 0.9906,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.06,
118
+ "grad_norm": 0.291189044713974,
119
+ "learning_rate": 9.999365613652217e-05,
120
+ "loss": 1.0197,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.06,
125
+ "grad_norm": 0.30276551842689514,
126
+ "learning_rate": 9.999271752644649e-05,
127
+ "loss": 1.0356,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.07,
132
+ "grad_norm": 0.25866344571113586,
133
+ "learning_rate": 9.999171419113666e-05,
134
+ "loss": 1.0332,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.07,
139
+ "grad_norm": 0.1927756369113922,
140
+ "learning_rate": 9.999064613189171e-05,
141
+ "loss": 1.0126,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.07,
146
+ "grad_norm": 0.2776283621788025,
147
+ "learning_rate": 9.998951335009442e-05,
148
+ "loss": 1.0429,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.07,
153
+ "eval_loss": 1.029819130897522,
154
+ "eval_runtime": 124.6792,
155
+ "eval_samples_per_second": 62.569,
156
+ "eval_steps_per_second": 3.914,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 0.08,
161
+ "grad_norm": 0.320551335811615,
162
+ "learning_rate": 9.998831584721141e-05,
163
+ "loss": 1.0431,
164
+ "step": 210
165
+ },
166
+ {
167
+ "epoch": 0.08,
168
+ "grad_norm": 0.46670058369636536,
169
+ "learning_rate": 9.998705362479307e-05,
170
+ "loss": 1.0374,
171
+ "step": 220
172
+ },
173
+ {
174
+ "epoch": 0.08,
175
+ "grad_norm": 0.30959388613700867,
176
+ "learning_rate": 9.99857266844736e-05,
177
+ "loss": 1.1065,
178
+ "step": 230
179
+ },
180
+ {
181
+ "epoch": 0.09,
182
+ "grad_norm": 0.3016811013221741,
183
+ "learning_rate": 9.998433502797095e-05,
184
+ "loss": 1.1105,
185
+ "step": 240
186
+ },
187
+ {
188
+ "epoch": 0.09,
189
+ "grad_norm": 0.356992244720459,
190
+ "learning_rate": 9.998287865708694e-05,
191
+ "loss": 0.9839,
192
+ "step": 250
193
+ },
194
+ {
195
+ "epoch": 0.09,
196
+ "grad_norm": 0.29836413264274597,
197
+ "learning_rate": 9.998135757370708e-05,
198
+ "loss": 1.0401,
199
+ "step": 260
200
+ },
201
+ {
202
+ "epoch": 0.1,
203
+ "grad_norm": 0.4305395483970642,
204
+ "learning_rate": 9.997977177980074e-05,
205
+ "loss": 1.0461,
206
+ "step": 270
207
+ },
208
+ {
209
+ "epoch": 0.1,
210
+ "grad_norm": 0.2959505021572113,
211
+ "learning_rate": 9.9978121277421e-05,
212
+ "loss": 1.0662,
213
+ "step": 280
214
+ },
215
+ {
216
+ "epoch": 0.1,
217
+ "grad_norm": 0.2577110826969147,
218
+ "learning_rate": 9.99764060687048e-05,
219
+ "loss": 1.0736,
220
+ "step": 290
221
+ },
222
+ {
223
+ "epoch": 0.11,
224
+ "grad_norm": 0.2583490014076233,
225
+ "learning_rate": 9.997462615587276e-05,
226
+ "loss": 0.9963,
227
+ "step": 300
228
+ },
229
+ {
230
+ "epoch": 0.11,
231
+ "grad_norm": 0.29901596903800964,
232
+ "learning_rate": 9.997278154122935e-05,
233
+ "loss": 1.044,
234
+ "step": 310
235
+ },
236
+ {
237
+ "epoch": 0.12,
238
+ "grad_norm": 0.24256502091884613,
239
+ "learning_rate": 9.997087222716278e-05,
240
+ "loss": 1.0713,
241
+ "step": 320
242
+ },
243
+ {
244
+ "epoch": 0.12,
245
+ "grad_norm": 0.267166405916214,
246
+ "learning_rate": 9.996889821614502e-05,
247
+ "loss": 1.0721,
248
+ "step": 330
249
+ },
250
+ {
251
+ "epoch": 0.12,
252
+ "grad_norm": 0.21612702310085297,
253
+ "learning_rate": 9.996685951073182e-05,
254
+ "loss": 1.0414,
255
+ "step": 340
256
+ },
257
+ {
258
+ "epoch": 0.13,
259
+ "grad_norm": 0.3107874095439911,
260
+ "learning_rate": 9.996475611356264e-05,
261
+ "loss": 0.9856,
262
+ "step": 350
263
+ },
264
+ {
265
+ "epoch": 0.13,
266
+ "grad_norm": 0.27626070380210876,
267
+ "learning_rate": 9.996258802736079e-05,
268
+ "loss": 1.0121,
269
+ "step": 360
270
+ },
271
+ {
272
+ "epoch": 0.13,
273
+ "grad_norm": 0.2957281172275543,
274
+ "learning_rate": 9.996035525493322e-05,
275
+ "loss": 1.0785,
276
+ "step": 370
277
+ },
278
+ {
279
+ "epoch": 0.14,
280
+ "grad_norm": 0.3168753981590271,
281
+ "learning_rate": 9.995805779917073e-05,
282
+ "loss": 0.996,
283
+ "step": 380
284
+ },
285
+ {
286
+ "epoch": 0.14,
287
+ "grad_norm": 0.24823521077632904,
288
+ "learning_rate": 9.99556956630478e-05,
289
+ "loss": 1.0557,
290
+ "step": 390
291
+ },
292
+ {
293
+ "epoch": 0.14,
294
+ "grad_norm": 0.3291969895362854,
295
+ "learning_rate": 9.995326884962268e-05,
296
+ "loss": 1.0505,
297
+ "step": 400
298
+ },
299
+ {
300
+ "epoch": 0.14,
301
+ "eval_loss": 1.023820400238037,
302
+ "eval_runtime": 124.7265,
303
+ "eval_samples_per_second": 62.545,
304
+ "eval_steps_per_second": 3.913,
305
+ "step": 400
306
+ },
307
+ {
308
+ "epoch": 0.15,
309
+ "grad_norm": 0.3567464351654053,
310
+ "learning_rate": 9.995077736203733e-05,
311
+ "loss": 0.9919,
312
+ "step": 410
313
+ },
314
+ {
315
+ "epoch": 0.15,
316
+ "grad_norm": 0.2938403785228729,
317
+ "learning_rate": 9.99482212035175e-05,
318
+ "loss": 1.0736,
319
+ "step": 420
320
+ },
321
+ {
322
+ "epoch": 0.16,
323
+ "grad_norm": 0.27481499314308167,
324
+ "learning_rate": 9.994560037737259e-05,
325
+ "loss": 1.0633,
326
+ "step": 430
327
+ },
328
+ {
329
+ "epoch": 0.16,
330
+ "grad_norm": 0.34652218222618103,
331
+ "learning_rate": 9.994291488699579e-05,
332
+ "loss": 1.049,
333
+ "step": 440
334
+ },
335
+ {
336
+ "epoch": 0.16,
337
+ "grad_norm": 0.23733928799629211,
338
+ "learning_rate": 9.994016473586398e-05,
339
+ "loss": 1.0022,
340
+ "step": 450
341
+ },
342
+ {
343
+ "epoch": 0.17,
344
+ "grad_norm": 0.2666071653366089,
345
+ "learning_rate": 9.993734992753777e-05,
346
+ "loss": 1.0076,
347
+ "step": 460
348
+ },
349
+ {
350
+ "epoch": 0.17,
351
+ "grad_norm": 0.22843866050243378,
352
+ "learning_rate": 9.993447046566146e-05,
353
+ "loss": 1.0298,
354
+ "step": 470
355
+ },
356
+ {
357
+ "epoch": 0.17,
358
+ "grad_norm": 0.4334356486797333,
359
+ "learning_rate": 9.993152635396308e-05,
360
+ "loss": 1.0635,
361
+ "step": 480
362
+ },
363
+ {
364
+ "epoch": 0.18,
365
+ "grad_norm": 0.25845977663993835,
366
+ "learning_rate": 9.992851759625433e-05,
367
+ "loss": 1.0183,
368
+ "step": 490
369
+ },
370
+ {
371
+ "epoch": 0.18,
372
+ "grad_norm": 0.26029086112976074,
373
+ "learning_rate": 9.992544419643066e-05,
374
+ "loss": 0.963,
375
+ "step": 500
376
+ },
377
+ {
378
+ "epoch": 0.18,
379
+ "grad_norm": 0.23090577125549316,
380
+ "learning_rate": 9.992230615847116e-05,
381
+ "loss": 0.9691,
382
+ "step": 510
383
+ },
384
+ {
385
+ "epoch": 0.19,
386
+ "grad_norm": 0.2835213243961334,
387
+ "learning_rate": 9.991910348643865e-05,
388
+ "loss": 1.0309,
389
+ "step": 520
390
+ },
391
+ {
392
+ "epoch": 0.19,
393
+ "grad_norm": 0.2612157166004181,
394
+ "learning_rate": 9.991583618447958e-05,
395
+ "loss": 1.0232,
396
+ "step": 530
397
+ },
398
+ {
399
+ "epoch": 0.2,
400
+ "grad_norm": 0.43860122561454773,
401
+ "learning_rate": 9.99125042568241e-05,
402
+ "loss": 1.0308,
403
+ "step": 540
404
+ },
405
+ {
406
+ "epoch": 0.2,
407
+ "grad_norm": 0.2504933476448059,
408
+ "learning_rate": 9.990910770778606e-05,
409
+ "loss": 1.0581,
410
+ "step": 550
411
+ },
412
+ {
413
+ "epoch": 0.2,
414
+ "grad_norm": 0.2778143286705017,
415
+ "learning_rate": 9.990564654176293e-05,
416
+ "loss": 0.958,
417
+ "step": 560
418
+ },
419
+ {
420
+ "epoch": 0.21,
421
+ "grad_norm": 0.29035818576812744,
422
+ "learning_rate": 9.990212076323586e-05,
423
+ "loss": 1.0258,
424
+ "step": 570
425
+ },
426
+ {
427
+ "epoch": 0.21,
428
+ "grad_norm": 0.307841032743454,
429
+ "learning_rate": 9.989853037676965e-05,
430
+ "loss": 1.0724,
431
+ "step": 580
432
+ },
433
+ {
434
+ "epoch": 0.21,
435
+ "grad_norm": 0.3011914789676666,
436
+ "learning_rate": 9.989487538701279e-05,
437
+ "loss": 0.9847,
438
+ "step": 590
439
+ },
440
+ {
441
+ "epoch": 0.22,
442
+ "grad_norm": 0.27195674180984497,
443
+ "learning_rate": 9.989115579869732e-05,
444
+ "loss": 1.044,
445
+ "step": 600
446
+ },
447
+ {
448
+ "epoch": 0.22,
449
+ "eval_loss": 1.0194298028945923,
450
+ "eval_runtime": 124.7334,
451
+ "eval_samples_per_second": 62.541,
452
+ "eval_steps_per_second": 3.912,
453
+ "step": 600
454
+ },
455
+ {
456
+ "epoch": 0.22,
457
+ "grad_norm": 0.2725551724433899,
458
+ "learning_rate": 9.988737161663898e-05,
459
+ "loss": 1.0244,
460
+ "step": 610
461
+ },
462
+ {
463
+ "epoch": 0.22,
464
+ "grad_norm": 0.2821577787399292,
465
+ "learning_rate": 9.988352284573713e-05,
466
+ "loss": 1.0254,
467
+ "step": 620
468
+ },
469
+ {
470
+ "epoch": 0.23,
471
+ "grad_norm": 0.3664613664150238,
472
+ "learning_rate": 9.987960949097475e-05,
473
+ "loss": 1.1093,
474
+ "step": 630
475
+ },
476
+ {
477
+ "epoch": 0.23,
478
+ "grad_norm": 0.3072526156902313,
479
+ "learning_rate": 9.987563155741842e-05,
480
+ "loss": 1.0196,
481
+ "step": 640
482
+ },
483
+ {
484
+ "epoch": 0.24,
485
+ "grad_norm": 0.24550805985927582,
486
+ "learning_rate": 9.987158905021836e-05,
487
+ "loss": 1.012,
488
+ "step": 650
489
+ },
490
+ {
491
+ "epoch": 0.24,
492
+ "grad_norm": 0.2521149814128876,
493
+ "learning_rate": 9.986748197460837e-05,
494
+ "loss": 1.0219,
495
+ "step": 660
496
+ },
497
+ {
498
+ "epoch": 0.24,
499
+ "grad_norm": 0.34175044298171997,
500
+ "learning_rate": 9.986331033590586e-05,
501
+ "loss": 1.015,
502
+ "step": 670
503
+ },
504
+ {
505
+ "epoch": 0.25,
506
+ "grad_norm": 0.30103522539138794,
507
+ "learning_rate": 9.98590741395118e-05,
508
+ "loss": 1.1113,
509
+ "step": 680
510
+ },
511
+ {
512
+ "epoch": 0.25,
513
+ "grad_norm": 0.2344699651002884,
514
+ "learning_rate": 9.985477339091078e-05,
515
+ "loss": 1.0456,
516
+ "step": 690
517
+ },
518
+ {
519
+ "epoch": 0.25,
520
+ "grad_norm": 0.26754796504974365,
521
+ "learning_rate": 9.985040809567097e-05,
522
+ "loss": 1.0102,
523
+ "step": 700
524
+ },
525
+ {
526
+ "epoch": 0.26,
527
+ "grad_norm": 0.31665658950805664,
528
+ "learning_rate": 9.984597825944405e-05,
529
+ "loss": 1.0057,
530
+ "step": 710
531
+ },
532
+ {
533
+ "epoch": 0.26,
534
+ "grad_norm": 0.2716057300567627,
535
+ "learning_rate": 9.984148388796532e-05,
536
+ "loss": 0.9937,
537
+ "step": 720
538
+ },
539
+ {
540
+ "epoch": 0.26,
541
+ "grad_norm": 0.2589300274848938,
542
+ "learning_rate": 9.983692498705361e-05,
543
+ "loss": 0.9937,
544
+ "step": 730
545
+ },
546
+ {
547
+ "epoch": 0.27,
548
+ "grad_norm": 0.2215312272310257,
549
+ "learning_rate": 9.983230156261132e-05,
550
+ "loss": 1.0205,
551
+ "step": 740
552
+ },
553
+ {
554
+ "epoch": 0.27,
555
+ "grad_norm": 0.26202231645584106,
556
+ "learning_rate": 9.982761362062432e-05,
557
+ "loss": 1.0486,
558
+ "step": 750
559
+ },
560
+ {
561
+ "epoch": 0.28,
562
+ "grad_norm": 0.21432209014892578,
563
+ "learning_rate": 9.982286116716208e-05,
564
+ "loss": 1.0679,
565
+ "step": 760
566
+ },
567
+ {
568
+ "epoch": 0.28,
569
+ "grad_norm": 0.4230276048183441,
570
+ "learning_rate": 9.98180442083776e-05,
571
+ "loss": 1.0051,
572
+ "step": 770
573
+ },
574
+ {
575
+ "epoch": 0.28,
576
+ "grad_norm": 0.26559358835220337,
577
+ "learning_rate": 9.981316275050731e-05,
578
+ "loss": 1.0398,
579
+ "step": 780
580
+ },
581
+ {
582
+ "epoch": 0.29,
583
+ "grad_norm": 0.2559758722782135,
584
+ "learning_rate": 9.980821679987125e-05,
585
+ "loss": 1.0365,
586
+ "step": 790
587
+ },
588
+ {
589
+ "epoch": 0.29,
590
+ "grad_norm": 0.34101855754852295,
591
+ "learning_rate": 9.980320636287285e-05,
592
+ "loss": 1.0169,
593
+ "step": 800
594
+ },
595
+ {
596
+ "epoch": 0.29,
597
+ "eval_loss": 1.0172123908996582,
598
+ "eval_runtime": 124.7169,
599
+ "eval_samples_per_second": 62.55,
600
+ "eval_steps_per_second": 3.913,
601
+ "step": 800
602
+ },
603
+ {
604
+ "epoch": 0.29,
605
+ "grad_norm": 0.3401408791542053,
606
+ "learning_rate": 9.979813144599915e-05,
607
+ "loss": 1.0165,
608
+ "step": 810
609
+ },
610
+ {
611
+ "epoch": 0.3,
612
+ "grad_norm": 0.34302470088005066,
613
+ "learning_rate": 9.979299205582057e-05,
614
+ "loss": 1.0314,
615
+ "step": 820
616
+ },
617
+ {
618
+ "epoch": 0.3,
619
+ "grad_norm": 0.2908473610877991,
620
+ "learning_rate": 9.978778819899109e-05,
621
+ "loss": 0.9779,
622
+ "step": 830
623
+ },
624
+ {
625
+ "epoch": 0.3,
626
+ "grad_norm": 0.229986771941185,
627
+ "learning_rate": 9.978251988224804e-05,
628
+ "loss": 0.9564,
629
+ "step": 840
630
+ },
631
+ {
632
+ "epoch": 0.31,
633
+ "grad_norm": 0.441243052482605,
634
+ "learning_rate": 9.977718711241233e-05,
635
+ "loss": 1.0275,
636
+ "step": 850
637
+ },
638
+ {
639
+ "epoch": 0.31,
640
+ "grad_norm": 0.2620699107646942,
641
+ "learning_rate": 9.977178989638822e-05,
642
+ "loss": 1.0293,
643
+ "step": 860
644
+ },
645
+ {
646
+ "epoch": 0.31,
647
+ "grad_norm": 0.27257561683654785,
648
+ "learning_rate": 9.97663282411635e-05,
649
+ "loss": 1.0508,
650
+ "step": 870
651
+ },
652
+ {
653
+ "epoch": 0.32,
654
+ "grad_norm": 0.306587278842926,
655
+ "learning_rate": 9.97608021538093e-05,
656
+ "loss": 0.9949,
657
+ "step": 880
658
+ },
659
+ {
660
+ "epoch": 0.32,
661
+ "grad_norm": 0.30046141147613525,
662
+ "learning_rate": 9.97552116414802e-05,
663
+ "loss": 1.0752,
664
+ "step": 890
665
+ },
666
+ {
667
+ "epoch": 0.33,
668
+ "grad_norm": 0.2749102711677551,
669
+ "learning_rate": 9.974955671141424e-05,
670
+ "loss": 0.9947,
671
+ "step": 900
672
+ },
673
+ {
674
+ "epoch": 0.33,
675
+ "grad_norm": 0.38608163595199585,
676
+ "learning_rate": 9.974383737093279e-05,
677
+ "loss": 1.0362,
678
+ "step": 910
679
+ },
680
+ {
681
+ "epoch": 0.33,
682
+ "grad_norm": 0.24529774487018585,
683
+ "learning_rate": 9.973805362744064e-05,
684
+ "loss": 1.0469,
685
+ "step": 920
686
+ },
687
+ {
688
+ "epoch": 0.34,
689
+ "grad_norm": 0.33143192529678345,
690
+ "learning_rate": 9.973220548842598e-05,
691
+ "loss": 0.9705,
692
+ "step": 930
693
+ },
694
+ {
695
+ "epoch": 0.34,
696
+ "grad_norm": 0.3112998306751251,
697
+ "learning_rate": 9.972629296146035e-05,
698
+ "loss": 0.9956,
699
+ "step": 940
700
+ },
701
+ {
702
+ "epoch": 0.34,
703
+ "grad_norm": 0.32970279455184937,
704
+ "learning_rate": 9.972031605419864e-05,
705
+ "loss": 1.0232,
706
+ "step": 950
707
+ },
708
+ {
709
+ "epoch": 0.35,
710
+ "grad_norm": 0.256101131439209,
711
+ "learning_rate": 9.971427477437914e-05,
712
+ "loss": 1.0471,
713
+ "step": 960
714
+ },
715
+ {
716
+ "epoch": 0.35,
717
+ "grad_norm": 0.4258672595024109,
718
+ "learning_rate": 9.970816912982344e-05,
719
+ "loss": 0.9652,
720
+ "step": 970
721
+ },
722
+ {
723
+ "epoch": 0.35,
724
+ "grad_norm": 0.3143826425075531,
725
+ "learning_rate": 9.970199912843648e-05,
726
+ "loss": 0.9894,
727
+ "step": 980
728
+ },
729
+ {
730
+ "epoch": 0.36,
731
+ "grad_norm": 0.2868054509162903,
732
+ "learning_rate": 9.96957647782065e-05,
733
+ "loss": 1.0437,
734
+ "step": 990
735
+ },
736
+ {
737
+ "epoch": 0.36,
738
+ "grad_norm": 0.2594622075557709,
739
+ "learning_rate": 9.968946608720511e-05,
740
+ "loss": 1.02,
741
+ "step": 1000
742
+ },
743
+ {
744
+ "epoch": 0.36,
745
+ "eval_loss": 1.0154483318328857,
746
+ "eval_runtime": 124.672,
747
+ "eval_samples_per_second": 62.572,
748
+ "eval_steps_per_second": 3.914,
749
+ "step": 1000
750
+ },
751
+ {
752
+ "epoch": 0.37,
753
+ "grad_norm": 0.2359086573123932,
754
+ "learning_rate": 9.968310306358715e-05,
755
+ "loss": 1.0676,
756
+ "step": 1010
757
+ },
758
+ {
759
+ "epoch": 0.37,
760
+ "grad_norm": 0.22080975770950317,
761
+ "learning_rate": 9.967667571559081e-05,
762
+ "loss": 1.027,
763
+ "step": 1020
764
+ },
765
+ {
766
+ "epoch": 0.37,
767
+ "grad_norm": 0.3211756944656372,
768
+ "learning_rate": 9.967018405153749e-05,
769
+ "loss": 1.0004,
770
+ "step": 1030
771
+ },
772
+ {
773
+ "epoch": 0.38,
774
+ "grad_norm": 0.3681553602218628,
775
+ "learning_rate": 9.966362807983196e-05,
776
+ "loss": 1.0395,
777
+ "step": 1040
778
+ },
779
+ {
780
+ "epoch": 0.38,
781
+ "grad_norm": 0.3180038332939148,
782
+ "learning_rate": 9.965700780896216e-05,
783
+ "loss": 0.9948,
784
+ "step": 1050
785
+ },
786
+ {
787
+ "epoch": 0.38,
788
+ "grad_norm": 0.25071969628334045,
789
+ "learning_rate": 9.965032324749932e-05,
790
+ "loss": 1.0281,
791
+ "step": 1060
792
+ },
793
+ {
794
+ "epoch": 0.39,
795
+ "grad_norm": 0.2274983674287796,
796
+ "learning_rate": 9.964357440409789e-05,
797
+ "loss": 1.0094,
798
+ "step": 1070
799
+ },
800
+ {
801
+ "epoch": 0.39,
802
+ "grad_norm": 0.24825724959373474,
803
+ "learning_rate": 9.963676128749553e-05,
804
+ "loss": 1.0272,
805
+ "step": 1080
806
+ },
807
+ {
808
+ "epoch": 0.39,
809
+ "grad_norm": 0.3256381154060364,
810
+ "learning_rate": 9.96298839065132e-05,
811
+ "loss": 1.0191,
812
+ "step": 1090
813
+ },
814
+ {
815
+ "epoch": 0.4,
816
+ "grad_norm": 0.31695234775543213,
817
+ "learning_rate": 9.962294227005493e-05,
818
+ "loss": 1.08,
819
+ "step": 1100
820
+ },
821
+ {
822
+ "epoch": 0.4,
823
+ "grad_norm": 0.288083553314209,
824
+ "learning_rate": 9.961593638710804e-05,
825
+ "loss": 0.9954,
826
+ "step": 1110
827
+ },
828
+ {
829
+ "epoch": 0.41,
830
+ "grad_norm": 0.29730525612831116,
831
+ "learning_rate": 9.960886626674302e-05,
832
+ "loss": 1.071,
833
+ "step": 1120
834
+ },
835
+ {
836
+ "epoch": 0.41,
837
+ "grad_norm": 0.2090187519788742,
838
+ "learning_rate": 9.960173191811348e-05,
839
+ "loss": 0.9725,
840
+ "step": 1130
841
+ },
842
+ {
843
+ "epoch": 0.41,
844
+ "grad_norm": 0.2811983525753021,
845
+ "learning_rate": 9.959453335045622e-05,
846
+ "loss": 1.0071,
847
+ "step": 1140
848
+ },
849
+ {
850
+ "epoch": 0.42,
851
+ "grad_norm": 0.27806761860847473,
852
+ "learning_rate": 9.958727057309115e-05,
853
+ "loss": 1.0108,
854
+ "step": 1150
855
+ },
856
+ {
857
+ "epoch": 0.42,
858
+ "grad_norm": 0.2864569127559662,
859
+ "learning_rate": 9.957994359542138e-05,
860
+ "loss": 1.0495,
861
+ "step": 1160
862
+ },
863
+ {
864
+ "epoch": 0.42,
865
+ "grad_norm": 0.3440109193325043,
866
+ "learning_rate": 9.957255242693308e-05,
867
+ "loss": 1.0015,
868
+ "step": 1170
869
+ },
870
+ {
871
+ "epoch": 0.43,
872
+ "grad_norm": 0.2824917435646057,
873
+ "learning_rate": 9.956509707719555e-05,
874
+ "loss": 1.0559,
875
+ "step": 1180
876
+ },
877
+ {
878
+ "epoch": 0.43,
879
+ "grad_norm": 0.3080492317676544,
880
+ "learning_rate": 9.955757755586119e-05,
881
+ "loss": 1.0134,
882
+ "step": 1190
883
+ },
884
+ {
885
+ "epoch": 0.43,
886
+ "grad_norm": 0.2890901565551758,
887
+ "learning_rate": 9.954999387266546e-05,
888
+ "loss": 0.9492,
889
+ "step": 1200
890
+ },
891
+ {
892
+ "epoch": 0.43,
893
+ "eval_loss": 1.0133627653121948,
894
+ "eval_runtime": 124.7104,
895
+ "eval_samples_per_second": 62.553,
896
+ "eval_steps_per_second": 3.913,
897
+ "step": 1200
898
+ },
899
+ {
900
+ "epoch": 0.44,
901
+ "grad_norm": 0.33987322449684143,
902
+ "learning_rate": 9.95423460374269e-05,
903
+ "loss": 0.9629,
904
+ "step": 1210
905
+ },
906
+ {
907
+ "epoch": 0.44,
908
+ "grad_norm": 0.29403063654899597,
909
+ "learning_rate": 9.953463406004713e-05,
910
+ "loss": 1.0384,
911
+ "step": 1220
912
+ },
913
+ {
914
+ "epoch": 0.45,
915
+ "grad_norm": 0.20130111277103424,
916
+ "learning_rate": 9.952685795051077e-05,
917
+ "loss": 1.0235,
918
+ "step": 1230
919
+ },
920
+ {
921
+ "epoch": 0.45,
922
+ "grad_norm": 0.1973690539598465,
923
+ "learning_rate": 9.951901771888552e-05,
924
+ "loss": 1.0395,
925
+ "step": 1240
926
+ },
927
+ {
928
+ "epoch": 0.45,
929
+ "grad_norm": 0.24519580602645874,
930
+ "learning_rate": 9.951111337532205e-05,
931
+ "loss": 1.0914,
932
+ "step": 1250
933
+ },
934
+ {
935
+ "epoch": 0.46,
936
+ "grad_norm": 0.2706618309020996,
937
+ "learning_rate": 9.950314493005408e-05,
938
+ "loss": 1.0714,
939
+ "step": 1260
940
+ },
941
+ {
942
+ "epoch": 0.46,
943
+ "grad_norm": 0.23367558419704437,
944
+ "learning_rate": 9.949511239339831e-05,
945
+ "loss": 1.0224,
946
+ "step": 1270
947
+ },
948
+ {
949
+ "epoch": 0.46,
950
+ "grad_norm": 0.30005407333374023,
951
+ "learning_rate": 9.948701577575439e-05,
952
+ "loss": 1.0152,
953
+ "step": 1280
954
+ },
955
+ {
956
+ "epoch": 0.47,
957
+ "grad_norm": 0.3130083382129669,
958
+ "learning_rate": 9.947885508760496e-05,
959
+ "loss": 0.8988,
960
+ "step": 1290
961
+ },
962
+ {
963
+ "epoch": 0.47,
964
+ "grad_norm": 0.23657679557800293,
965
+ "learning_rate": 9.94706303395156e-05,
966
+ "loss": 1.0242,
967
+ "step": 1300
968
+ },
969
+ {
970
+ "epoch": 0.47,
971
+ "grad_norm": 0.40966659784317017,
972
+ "learning_rate": 9.946234154213487e-05,
973
+ "loss": 1.0145,
974
+ "step": 1310
975
+ },
976
+ {
977
+ "epoch": 0.48,
978
+ "grad_norm": 0.35292962193489075,
979
+ "learning_rate": 9.94539887061942e-05,
980
+ "loss": 1.0197,
981
+ "step": 1320
982
+ },
983
+ {
984
+ "epoch": 0.48,
985
+ "grad_norm": 0.38793638348579407,
986
+ "learning_rate": 9.944557184250794e-05,
987
+ "loss": 1.0273,
988
+ "step": 1330
989
+ },
990
+ {
991
+ "epoch": 0.48,
992
+ "grad_norm": 0.27373677492141724,
993
+ "learning_rate": 9.943709096197335e-05,
994
+ "loss": 0.9561,
995
+ "step": 1340
996
+ },
997
+ {
998
+ "epoch": 0.49,
999
+ "grad_norm": 0.24536257982254028,
1000
+ "learning_rate": 9.942854607557057e-05,
1001
+ "loss": 0.9678,
1002
+ "step": 1350
1003
+ },
1004
+ {
1005
+ "epoch": 0.49,
1006
+ "grad_norm": 0.4609609842300415,
1007
+ "learning_rate": 9.941993719436262e-05,
1008
+ "loss": 1.0429,
1009
+ "step": 1360
1010
+ },
1011
+ {
1012
+ "epoch": 0.5,
1013
+ "grad_norm": 0.27118805050849915,
1014
+ "learning_rate": 9.941126432949535e-05,
1015
+ "loss": 1.0506,
1016
+ "step": 1370
1017
+ },
1018
+ {
1019
+ "epoch": 0.5,
1020
+ "grad_norm": 0.27538400888442993,
1021
+ "learning_rate": 9.940252749219746e-05,
1022
+ "loss": 1.0326,
1023
+ "step": 1380
1024
+ },
1025
+ {
1026
+ "epoch": 0.5,
1027
+ "grad_norm": 0.2451954036951065,
1028
+ "learning_rate": 9.939372669378048e-05,
1029
+ "loss": 1.0413,
1030
+ "step": 1390
1031
+ },
1032
+ {
1033
+ "epoch": 0.51,
1034
+ "grad_norm": 0.2622232437133789,
1035
+ "learning_rate": 9.938486194563875e-05,
1036
+ "loss": 1.0051,
1037
+ "step": 1400
1038
+ },
1039
+ {
1040
+ "epoch": 0.51,
1041
+ "eval_loss": 1.011703372001648,
1042
+ "eval_runtime": 124.6726,
1043
+ "eval_samples_per_second": 62.572,
1044
+ "eval_steps_per_second": 3.914,
1045
+ "step": 1400
1046
+ },
1047
+ {
1048
+ "epoch": 0.51,
1049
+ "grad_norm": 0.2616746425628662,
1050
+ "learning_rate": 9.937593325924937e-05,
1051
+ "loss": 1.0277,
1052
+ "step": 1410
1053
+ },
1054
+ {
1055
+ "epoch": 0.51,
1056
+ "grad_norm": 0.2952045202255249,
1057
+ "learning_rate": 9.936694064617227e-05,
1058
+ "loss": 0.9802,
1059
+ "step": 1420
1060
+ },
1061
+ {
1062
+ "epoch": 0.52,
1063
+ "grad_norm": 0.2611790895462036,
1064
+ "learning_rate": 9.935788411805011e-05,
1065
+ "loss": 0.9811,
1066
+ "step": 1430
1067
+ },
1068
+ {
1069
+ "epoch": 0.52,
1070
+ "grad_norm": 0.3291374742984772,
1071
+ "learning_rate": 9.934876368660836e-05,
1072
+ "loss": 0.9972,
1073
+ "step": 1440
1074
+ },
1075
+ {
1076
+ "epoch": 0.52,
1077
+ "grad_norm": 0.32888704538345337,
1078
+ "learning_rate": 9.933957936365515e-05,
1079
+ "loss": 1.1006,
1080
+ "step": 1450
1081
+ },
1082
+ {
1083
+ "epoch": 0.53,
1084
+ "grad_norm": 0.20011785626411438,
1085
+ "learning_rate": 9.933033116108134e-05,
1086
+ "loss": 1.0139,
1087
+ "step": 1460
1088
+ },
1089
+ {
1090
+ "epoch": 0.53,
1091
+ "grad_norm": 0.3157961666584015,
1092
+ "learning_rate": 9.932101909086056e-05,
1093
+ "loss": 0.993,
1094
+ "step": 1470
1095
+ },
1096
+ {
1097
+ "epoch": 0.54,
1098
+ "grad_norm": 0.22981207072734833,
1099
+ "learning_rate": 9.931164316504904e-05,
1100
+ "loss": 1.0539,
1101
+ "step": 1480
1102
+ },
1103
+ {
1104
+ "epoch": 0.54,
1105
+ "grad_norm": 0.23787029087543488,
1106
+ "learning_rate": 9.930220339578576e-05,
1107
+ "loss": 0.9599,
1108
+ "step": 1490
1109
+ },
1110
+ {
1111
+ "epoch": 0.54,
1112
+ "grad_norm": 0.2633046507835388,
1113
+ "learning_rate": 9.929269979529232e-05,
1114
+ "loss": 0.9813,
1115
+ "step": 1500
1116
+ },
1117
+ {
1118
+ "epoch": 0.55,
1119
+ "grad_norm": 0.2666633725166321,
1120
+ "learning_rate": 9.928313237587296e-05,
1121
+ "loss": 0.9637,
1122
+ "step": 1510
1123
+ },
1124
+ {
1125
+ "epoch": 0.55,
1126
+ "grad_norm": 0.26092538237571716,
1127
+ "learning_rate": 9.927350114991456e-05,
1128
+ "loss": 1.0375,
1129
+ "step": 1520
1130
+ },
1131
+ {
1132
+ "epoch": 0.55,
1133
+ "grad_norm": 0.2837240397930145,
1134
+ "learning_rate": 9.92638061298866e-05,
1135
+ "loss": 1.0053,
1136
+ "step": 1530
1137
+ },
1138
+ {
1139
+ "epoch": 0.56,
1140
+ "grad_norm": 0.2586491107940674,
1141
+ "learning_rate": 9.925404732834117e-05,
1142
+ "loss": 1.0631,
1143
+ "step": 1540
1144
+ },
1145
+ {
1146
+ "epoch": 0.56,
1147
+ "grad_norm": 0.43321874737739563,
1148
+ "learning_rate": 9.924422475791288e-05,
1149
+ "loss": 1.0134,
1150
+ "step": 1550
1151
+ },
1152
+ {
1153
+ "epoch": 0.56,
1154
+ "grad_norm": 0.19062629342079163,
1155
+ "learning_rate": 9.923433843131901e-05,
1156
+ "loss": 0.9989,
1157
+ "step": 1560
1158
+ },
1159
+ {
1160
+ "epoch": 0.57,
1161
+ "grad_norm": 0.34545308351516724,
1162
+ "learning_rate": 9.922438836135928e-05,
1163
+ "loss": 1.0896,
1164
+ "step": 1570
1165
+ },
1166
+ {
1167
+ "epoch": 0.57,
1168
+ "grad_norm": 0.2846600115299225,
1169
+ "learning_rate": 9.921437456091596e-05,
1170
+ "loss": 0.9954,
1171
+ "step": 1580
1172
+ },
1173
+ {
1174
+ "epoch": 0.58,
1175
+ "grad_norm": 0.25403323769569397,
1176
+ "learning_rate": 9.920429704295391e-05,
1177
+ "loss": 0.9937,
1178
+ "step": 1590
1179
+ },
1180
+ {
1181
+ "epoch": 0.58,
1182
+ "grad_norm": 0.23549498617649078,
1183
+ "learning_rate": 9.919415582052036e-05,
1184
+ "loss": 1.0469,
1185
+ "step": 1600
1186
+ },
1187
+ {
1188
+ "epoch": 0.58,
1189
+ "eval_loss": 1.0105613470077515,
1190
+ "eval_runtime": 124.7139,
1191
+ "eval_samples_per_second": 62.551,
1192
+ "eval_steps_per_second": 3.913,
1193
+ "step": 1600
1194
+ },
1195
+ {
1196
+ "epoch": 0.58,
1197
+ "grad_norm": 0.21466514468193054,
1198
+ "learning_rate": 9.918395090674514e-05,
1199
+ "loss": 1.0408,
1200
+ "step": 1610
1201
+ },
1202
+ {
1203
+ "epoch": 0.59,
1204
+ "grad_norm": 0.21247586607933044,
1205
+ "learning_rate": 9.917368231484045e-05,
1206
+ "loss": 0.9893,
1207
+ "step": 1620
1208
+ },
1209
+ {
1210
+ "epoch": 0.59,
1211
+ "grad_norm": 0.26590731739997864,
1212
+ "learning_rate": 9.916335005810095e-05,
1213
+ "loss": 1.0563,
1214
+ "step": 1630
1215
+ },
1216
+ {
1217
+ "epoch": 0.59,
1218
+ "grad_norm": 0.2346472591161728,
1219
+ "learning_rate": 9.91529541499038e-05,
1220
+ "loss": 1.0061,
1221
+ "step": 1640
1222
+ },
1223
+ {
1224
+ "epoch": 0.6,
1225
+ "grad_norm": 0.27766481041908264,
1226
+ "learning_rate": 9.914249460370846e-05,
1227
+ "loss": 0.9639,
1228
+ "step": 1650
1229
+ },
1230
+ {
1231
+ "epoch": 0.6,
1232
+ "grad_norm": 0.24883978068828583,
1233
+ "learning_rate": 9.913197143305684e-05,
1234
+ "loss": 1.0289,
1235
+ "step": 1660
1236
+ },
1237
+ {
1238
+ "epoch": 0.6,
1239
+ "grad_norm": 0.2379382699728012,
1240
+ "learning_rate": 9.912138465157325e-05,
1241
+ "loss": 1.0154,
1242
+ "step": 1670
1243
+ },
1244
+ {
1245
+ "epoch": 0.61,
1246
+ "grad_norm": 0.17160119116306305,
1247
+ "learning_rate": 9.91107342729643e-05,
1248
+ "loss": 1.0002,
1249
+ "step": 1680
1250
+ },
1251
+ {
1252
+ "epoch": 0.61,
1253
+ "grad_norm": 0.2804344892501831,
1254
+ "learning_rate": 9.910002031101895e-05,
1255
+ "loss": 0.9887,
1256
+ "step": 1690
1257
+ },
1258
+ {
1259
+ "epoch": 0.62,
1260
+ "grad_norm": 0.2296508252620697,
1261
+ "learning_rate": 9.908924277960854e-05,
1262
+ "loss": 1.0703,
1263
+ "step": 1700
1264
+ },
1265
+ {
1266
+ "epoch": 0.62,
1267
+ "grad_norm": 0.22265523672103882,
1268
+ "learning_rate": 9.907840169268662e-05,
1269
+ "loss": 0.9495,
1270
+ "step": 1710
1271
+ },
1272
+ {
1273
+ "epoch": 0.62,
1274
+ "grad_norm": 0.3383825123310089,
1275
+ "learning_rate": 9.90674970642891e-05,
1276
+ "loss": 0.9878,
1277
+ "step": 1720
1278
+ },
1279
+ {
1280
+ "epoch": 0.63,
1281
+ "grad_norm": 0.2603285312652588,
1282
+ "learning_rate": 9.905652890853411e-05,
1283
+ "loss": 1.0351,
1284
+ "step": 1730
1285
+ },
1286
+ {
1287
+ "epoch": 0.63,
1288
+ "grad_norm": 0.27001509070396423,
1289
+ "learning_rate": 9.904549723962206e-05,
1290
+ "loss": 1.0528,
1291
+ "step": 1740
1292
+ },
1293
+ {
1294
+ "epoch": 0.63,
1295
+ "grad_norm": 0.34035804867744446,
1296
+ "learning_rate": 9.903440207183558e-05,
1297
+ "loss": 1.0159,
1298
+ "step": 1750
1299
+ },
1300
+ {
1301
+ "epoch": 0.64,
1302
+ "grad_norm": 0.3518404960632324,
1303
+ "learning_rate": 9.90232434195395e-05,
1304
+ "loss": 0.9879,
1305
+ "step": 1760
1306
+ },
1307
+ {
1308
+ "epoch": 0.64,
1309
+ "grad_norm": 0.24958577752113342,
1310
+ "learning_rate": 9.901202129718086e-05,
1311
+ "loss": 1.0221,
1312
+ "step": 1770
1313
+ },
1314
+ {
1315
+ "epoch": 0.64,
1316
+ "grad_norm": 0.23898568749427795,
1317
+ "learning_rate": 9.900073571928886e-05,
1318
+ "loss": 1.037,
1319
+ "step": 1780
1320
+ },
1321
+ {
1322
+ "epoch": 0.65,
1323
+ "grad_norm": 0.22275009751319885,
1324
+ "learning_rate": 9.898938670047486e-05,
1325
+ "loss": 1.0008,
1326
+ "step": 1790
1327
+ },
1328
+ {
1329
+ "epoch": 0.65,
1330
+ "grad_norm": 0.2770971655845642,
1331
+ "learning_rate": 9.897797425543236e-05,
1332
+ "loss": 0.9994,
1333
+ "step": 1800
1334
+ },
1335
+ {
1336
+ "epoch": 0.65,
1337
+ "eval_loss": 1.0094062089920044,
1338
+ "eval_runtime": 124.6598,
1339
+ "eval_samples_per_second": 62.578,
1340
+ "eval_steps_per_second": 3.915,
1341
+ "step": 1800
1342
+ },
1343
+ {
1344
+ "epoch": 0.66,
1345
+ "grad_norm": 0.2470710575580597,
1346
+ "learning_rate": 9.896649839893699e-05,
1347
+ "loss": 1.0093,
1348
+ "step": 1810
1349
+ },
1350
+ {
1351
+ "epoch": 0.66,
1352
+ "grad_norm": 0.31282275915145874,
1353
+ "learning_rate": 9.895495914584643e-05,
1354
+ "loss": 1.0124,
1355
+ "step": 1820
1356
+ },
1357
+ {
1358
+ "epoch": 0.66,
1359
+ "grad_norm": 0.2757389545440674,
1360
+ "learning_rate": 9.894335651110051e-05,
1361
+ "loss": 1.0197,
1362
+ "step": 1830
1363
+ },
1364
+ {
1365
+ "epoch": 0.67,
1366
+ "grad_norm": 0.3123573362827301,
1367
+ "learning_rate": 9.893169050972106e-05,
1368
+ "loss": 0.9469,
1369
+ "step": 1840
1370
+ },
1371
+ {
1372
+ "epoch": 0.67,
1373
+ "grad_norm": 0.4073740839958191,
1374
+ "learning_rate": 9.8919961156812e-05,
1375
+ "loss": 1.0153,
1376
+ "step": 1850
1377
+ },
1378
+ {
1379
+ "epoch": 0.67,
1380
+ "grad_norm": 0.24388962984085083,
1381
+ "learning_rate": 9.89081684675592e-05,
1382
+ "loss": 1.0124,
1383
+ "step": 1860
1384
+ },
1385
+ {
1386
+ "epoch": 0.68,
1387
+ "grad_norm": 0.27508777379989624,
1388
+ "learning_rate": 9.88963124572306e-05,
1389
+ "loss": 0.96,
1390
+ "step": 1870
1391
+ },
1392
+ {
1393
+ "epoch": 0.68,
1394
+ "grad_norm": 0.2843553125858307,
1395
+ "learning_rate": 9.88843931411761e-05,
1396
+ "loss": 1.0448,
1397
+ "step": 1880
1398
+ },
1399
+ {
1400
+ "epoch": 0.68,
1401
+ "grad_norm": 0.25155389308929443,
1402
+ "learning_rate": 9.887241053482757e-05,
1403
+ "loss": 1.0362,
1404
+ "step": 1890
1405
+ },
1406
+ {
1407
+ "epoch": 0.69,
1408
+ "grad_norm": 0.21977895498275757,
1409
+ "learning_rate": 9.886036465369877e-05,
1410
+ "loss": 1.0658,
1411
+ "step": 1900
1412
+ },
1413
+ {
1414
+ "epoch": 0.69,
1415
+ "grad_norm": 0.22326160967350006,
1416
+ "learning_rate": 9.884825551338546e-05,
1417
+ "loss": 1.0068,
1418
+ "step": 1910
1419
+ },
1420
+ {
1421
+ "epoch": 0.69,
1422
+ "grad_norm": 0.3339684307575226,
1423
+ "learning_rate": 9.883608312956524e-05,
1424
+ "loss": 1.0147,
1425
+ "step": 1920
1426
+ },
1427
+ {
1428
+ "epoch": 0.7,
1429
+ "grad_norm": 0.26512840390205383,
1430
+ "learning_rate": 9.882384751799762e-05,
1431
+ "loss": 0.9421,
1432
+ "step": 1930
1433
+ },
1434
+ {
1435
+ "epoch": 0.7,
1436
+ "grad_norm": 0.313123881816864,
1437
+ "learning_rate": 9.881154869452395e-05,
1438
+ "loss": 1.0032,
1439
+ "step": 1940
1440
+ },
1441
+ {
1442
+ "epoch": 0.71,
1443
+ "grad_norm": 0.3562926948070526,
1444
+ "learning_rate": 9.879918667506748e-05,
1445
+ "loss": 1.0491,
1446
+ "step": 1950
1447
+ },
1448
+ {
1449
+ "epoch": 0.71,
1450
+ "grad_norm": 0.373032808303833,
1451
+ "learning_rate": 9.87867614756332e-05,
1452
+ "loss": 0.9823,
1453
+ "step": 1960
1454
+ },
1455
+ {
1456
+ "epoch": 0.71,
1457
+ "grad_norm": 0.2701728641986847,
1458
+ "learning_rate": 9.87742731123079e-05,
1459
+ "loss": 1.0326,
1460
+ "step": 1970
1461
+ },
1462
+ {
1463
+ "epoch": 0.72,
1464
+ "grad_norm": 0.4167492687702179,
1465
+ "learning_rate": 9.876172160126024e-05,
1466
+ "loss": 1.0256,
1467
+ "step": 1980
1468
+ },
1469
+ {
1470
+ "epoch": 0.72,
1471
+ "grad_norm": 0.2636062800884247,
1472
+ "learning_rate": 9.874910695874053e-05,
1473
+ "loss": 1.0301,
1474
+ "step": 1990
1475
+ },
1476
+ {
1477
+ "epoch": 0.72,
1478
+ "grad_norm": 0.27048760652542114,
1479
+ "learning_rate": 9.873642920108091e-05,
1480
+ "loss": 1.0141,
1481
+ "step": 2000
1482
+ },
1483
+ {
1484
+ "epoch": 0.72,
1485
+ "eval_loss": 1.0082145929336548,
1486
+ "eval_runtime": 124.7209,
1487
+ "eval_samples_per_second": 62.548,
1488
+ "eval_steps_per_second": 3.913,
1489
+ "step": 2000
1490
+ },
1491
+ {
1492
+ "epoch": 0.73,
1493
+ "grad_norm": 0.26596397161483765,
1494
+ "learning_rate": 9.872368834469514e-05,
1495
+ "loss": 0.9554,
1496
+ "step": 2010
1497
+ },
1498
+ {
1499
+ "epoch": 0.73,
1500
+ "grad_norm": 0.3881726861000061,
1501
+ "learning_rate": 9.871088440607874e-05,
1502
+ "loss": 1.0374,
1503
+ "step": 2020
1504
+ },
1505
+ {
1506
+ "epoch": 0.73,
1507
+ "grad_norm": 0.345869243144989,
1508
+ "learning_rate": 9.869801740180889e-05,
1509
+ "loss": 1.01,
1510
+ "step": 2030
1511
+ },
1512
+ {
1513
+ "epoch": 0.74,
1514
+ "grad_norm": 0.3740908205509186,
1515
+ "learning_rate": 9.86850873485444e-05,
1516
+ "loss": 1.0244,
1517
+ "step": 2040
1518
+ },
1519
+ {
1520
+ "epoch": 0.74,
1521
+ "grad_norm": 0.3265666663646698,
1522
+ "learning_rate": 9.867209426302572e-05,
1523
+ "loss": 0.9303,
1524
+ "step": 2050
1525
+ },
1526
+ {
1527
+ "epoch": 0.75,
1528
+ "grad_norm": 0.381783664226532,
1529
+ "learning_rate": 9.865903816207493e-05,
1530
+ "loss": 1.0851,
1531
+ "step": 2060
1532
+ },
1533
+ {
1534
+ "epoch": 0.75,
1535
+ "grad_norm": 0.30846527218818665,
1536
+ "learning_rate": 9.864591906259568e-05,
1537
+ "loss": 1.0042,
1538
+ "step": 2070
1539
+ },
1540
+ {
1541
+ "epoch": 0.75,
1542
+ "grad_norm": 0.36899617314338684,
1543
+ "learning_rate": 9.863273698157315e-05,
1544
+ "loss": 0.9866,
1545
+ "step": 2080
1546
+ },
1547
+ {
1548
+ "epoch": 0.76,
1549
+ "grad_norm": 0.25415265560150146,
1550
+ "learning_rate": 9.861949193607411e-05,
1551
+ "loss": 1.056,
1552
+ "step": 2090
1553
+ },
1554
+ {
1555
+ "epoch": 0.76,
1556
+ "grad_norm": 0.3369081914424896,
1557
+ "learning_rate": 9.860618394324682e-05,
1558
+ "loss": 0.9988,
1559
+ "step": 2100
1560
+ },
1561
+ {
1562
+ "epoch": 0.76,
1563
+ "grad_norm": 0.19644911587238312,
1564
+ "learning_rate": 9.859281302032106e-05,
1565
+ "loss": 0.9562,
1566
+ "step": 2110
1567
+ },
1568
+ {
1569
+ "epoch": 0.77,
1570
+ "grad_norm": 0.3449130356311798,
1571
+ "learning_rate": 9.857937918460808e-05,
1572
+ "loss": 1.0325,
1573
+ "step": 2120
1574
+ },
1575
+ {
1576
+ "epoch": 0.77,
1577
+ "grad_norm": 0.2639143764972687,
1578
+ "learning_rate": 9.856588245350056e-05,
1579
+ "loss": 1.0458,
1580
+ "step": 2130
1581
+ },
1582
+ {
1583
+ "epoch": 0.77,
1584
+ "grad_norm": 0.2752164602279663,
1585
+ "learning_rate": 9.855232284447262e-05,
1586
+ "loss": 1.089,
1587
+ "step": 2140
1588
+ },
1589
+ {
1590
+ "epoch": 0.78,
1591
+ "grad_norm": 0.31700417399406433,
1592
+ "learning_rate": 9.853870037507983e-05,
1593
+ "loss": 1.0398,
1594
+ "step": 2150
1595
+ },
1596
+ {
1597
+ "epoch": 0.78,
1598
+ "grad_norm": 0.24685466289520264,
1599
+ "learning_rate": 9.852501506295907e-05,
1600
+ "loss": 1.0038,
1601
+ "step": 2160
1602
+ },
1603
+ {
1604
+ "epoch": 0.79,
1605
+ "grad_norm": 0.28860118985176086,
1606
+ "learning_rate": 9.851126692582864e-05,
1607
+ "loss": 1.0343,
1608
+ "step": 2170
1609
+ },
1610
+ {
1611
+ "epoch": 0.79,
1612
+ "grad_norm": 0.2774854898452759,
1613
+ "learning_rate": 9.849745598148817e-05,
1614
+ "loss": 0.9986,
1615
+ "step": 2180
1616
+ },
1617
+ {
1618
+ "epoch": 0.79,
1619
+ "grad_norm": 0.28867611289024353,
1620
+ "learning_rate": 9.848358224781857e-05,
1621
+ "loss": 1.035,
1622
+ "step": 2190
1623
+ },
1624
+ {
1625
+ "epoch": 0.8,
1626
+ "grad_norm": 0.2703929841518402,
1627
+ "learning_rate": 9.84696457427821e-05,
1628
+ "loss": 1.0891,
1629
+ "step": 2200
1630
+ },
1631
+ {
1632
+ "epoch": 0.8,
1633
+ "eval_loss": 1.0072919130325317,
1634
+ "eval_runtime": 125.0779,
1635
+ "eval_samples_per_second": 62.369,
1636
+ "eval_steps_per_second": 3.902,
1637
+ "step": 2200
1638
+ },
1639
+ {
1640
+ "epoch": 0.8,
1641
+ "grad_norm": 0.3247489035129547,
1642
+ "learning_rate": 9.845564648442222e-05,
1643
+ "loss": 1.0259,
1644
+ "step": 2210
1645
+ },
1646
+ {
1647
+ "epoch": 0.8,
1648
+ "grad_norm": 0.2535197138786316,
1649
+ "learning_rate": 9.844158449086371e-05,
1650
+ "loss": 1.0457,
1651
+ "step": 2220
1652
+ },
1653
+ {
1654
+ "epoch": 0.81,
1655
+ "grad_norm": 0.26780492067337036,
1656
+ "learning_rate": 9.842745978031253e-05,
1657
+ "loss": 0.9869,
1658
+ "step": 2230
1659
+ },
1660
+ {
1661
+ "epoch": 0.81,
1662
+ "grad_norm": 0.29711589217185974,
1663
+ "learning_rate": 9.841327237105585e-05,
1664
+ "loss": 1.0158,
1665
+ "step": 2240
1666
+ },
1667
+ {
1668
+ "epoch": 0.81,
1669
+ "grad_norm": 0.239434614777565,
1670
+ "learning_rate": 9.8399022281462e-05,
1671
+ "loss": 0.997,
1672
+ "step": 2250
1673
+ },
1674
+ {
1675
+ "epoch": 0.82,
1676
+ "grad_norm": 0.2368830293416977,
1677
+ "learning_rate": 9.838470952998049e-05,
1678
+ "loss": 1.0148,
1679
+ "step": 2260
1680
+ },
1681
+ {
1682
+ "epoch": 0.82,
1683
+ "grad_norm": 0.2554934322834015,
1684
+ "learning_rate": 9.837033413514191e-05,
1685
+ "loss": 0.9787,
1686
+ "step": 2270
1687
+ },
1688
+ {
1689
+ "epoch": 0.83,
1690
+ "grad_norm": 0.2310570627450943,
1691
+ "learning_rate": 9.835589611555805e-05,
1692
+ "loss": 0.9656,
1693
+ "step": 2280
1694
+ },
1695
+ {
1696
+ "epoch": 0.83,
1697
+ "grad_norm": 0.22654668986797333,
1698
+ "learning_rate": 9.834139548992165e-05,
1699
+ "loss": 0.9837,
1700
+ "step": 2290
1701
+ },
1702
+ {
1703
+ "epoch": 0.83,
1704
+ "grad_norm": 0.25957950949668884,
1705
+ "learning_rate": 9.832683227700661e-05,
1706
+ "loss": 1.0513,
1707
+ "step": 2300
1708
+ },
1709
+ {
1710
+ "epoch": 0.84,
1711
+ "grad_norm": 0.20669637620449066,
1712
+ "learning_rate": 9.831220649566782e-05,
1713
+ "loss": 0.9649,
1714
+ "step": 2310
1715
+ },
1716
+ {
1717
+ "epoch": 0.84,
1718
+ "grad_norm": 0.24330663681030273,
1719
+ "learning_rate": 9.829751816484116e-05,
1720
+ "loss": 1.0208,
1721
+ "step": 2320
1722
+ },
1723
+ {
1724
+ "epoch": 0.84,
1725
+ "grad_norm": 0.28211724758148193,
1726
+ "learning_rate": 9.828276730354353e-05,
1727
+ "loss": 0.9512,
1728
+ "step": 2330
1729
+ },
1730
+ {
1731
+ "epoch": 0.85,
1732
+ "grad_norm": 0.23784276843070984,
1733
+ "learning_rate": 9.826795393087278e-05,
1734
+ "loss": 0.976,
1735
+ "step": 2340
1736
+ },
1737
+ {
1738
+ "epoch": 0.85,
1739
+ "grad_norm": 0.2881389260292053,
1740
+ "learning_rate": 9.825307806600765e-05,
1741
+ "loss": 1.0036,
1742
+ "step": 2350
1743
+ },
1744
+ {
1745
+ "epoch": 0.85,
1746
+ "grad_norm": 0.27906882762908936,
1747
+ "learning_rate": 9.823813972820786e-05,
1748
+ "loss": 1.0555,
1749
+ "step": 2360
1750
+ },
1751
+ {
1752
+ "epoch": 0.86,
1753
+ "grad_norm": 0.25142115354537964,
1754
+ "learning_rate": 9.822313893681397e-05,
1755
+ "loss": 1.0483,
1756
+ "step": 2370
1757
+ },
1758
+ {
1759
+ "epoch": 0.86,
1760
+ "grad_norm": 0.244681715965271,
1761
+ "learning_rate": 9.820807571124738e-05,
1762
+ "loss": 1.0102,
1763
+ "step": 2380
1764
+ },
1765
+ {
1766
+ "epoch": 0.87,
1767
+ "grad_norm": 0.3696367144584656,
1768
+ "learning_rate": 9.819295007101035e-05,
1769
+ "loss": 1.0626,
1770
+ "step": 2390
1771
+ },
1772
+ {
1773
+ "epoch": 0.87,
1774
+ "grad_norm": 0.26112619042396545,
1775
+ "learning_rate": 9.817776203568596e-05,
1776
+ "loss": 1.0141,
1777
+ "step": 2400
1778
+ },
1779
+ {
1780
+ "epoch": 0.87,
1781
+ "eval_loss": 1.0063296556472778,
1782
+ "eval_runtime": 125.7335,
1783
+ "eval_samples_per_second": 62.044,
1784
+ "eval_steps_per_second": 3.881,
1785
+ "step": 2400
1786
+ },
1787
+ {
1788
+ "epoch": 0.87,
1789
+ "grad_norm": 0.25221410393714905,
1790
+ "learning_rate": 9.816251162493804e-05,
1791
+ "loss": 1.0222,
1792
+ "step": 2410
1793
+ },
1794
+ {
1795
+ "epoch": 0.88,
1796
+ "grad_norm": 0.19672074913978577,
1797
+ "learning_rate": 9.814719885851121e-05,
1798
+ "loss": 0.9891,
1799
+ "step": 2420
1800
+ },
1801
+ {
1802
+ "epoch": 0.88,
1803
+ "grad_norm": 0.3084292411804199,
1804
+ "learning_rate": 9.81318237562308e-05,
1805
+ "loss": 0.9785,
1806
+ "step": 2430
1807
+ },
1808
+ {
1809
+ "epoch": 0.88,
1810
+ "grad_norm": 0.3434545397758484,
1811
+ "learning_rate": 9.811638633800287e-05,
1812
+ "loss": 0.9357,
1813
+ "step": 2440
1814
+ },
1815
+ {
1816
+ "epoch": 0.89,
1817
+ "grad_norm": 0.23335447907447815,
1818
+ "learning_rate": 9.81008866238141e-05,
1819
+ "loss": 1.0485,
1820
+ "step": 2450
1821
+ },
1822
+ {
1823
+ "epoch": 0.89,
1824
+ "grad_norm": 0.2942172586917877,
1825
+ "learning_rate": 9.808532463373188e-05,
1826
+ "loss": 1.0138,
1827
+ "step": 2460
1828
+ },
1829
+ {
1830
+ "epoch": 0.89,
1831
+ "grad_norm": 0.22536420822143555,
1832
+ "learning_rate": 9.806970038790423e-05,
1833
+ "loss": 1.0421,
1834
+ "step": 2470
1835
+ },
1836
+ {
1837
+ "epoch": 0.9,
1838
+ "grad_norm": 0.30886924266815186,
1839
+ "learning_rate": 9.805401390655975e-05,
1840
+ "loss": 0.9926,
1841
+ "step": 2480
1842
+ },
1843
+ {
1844
+ "epoch": 0.9,
1845
+ "grad_norm": 0.34105512499809265,
1846
+ "learning_rate": 9.803826521000761e-05,
1847
+ "loss": 1.0013,
1848
+ "step": 2490
1849
+ },
1850
+ {
1851
+ "epoch": 0.9,
1852
+ "grad_norm": 0.261643648147583,
1853
+ "learning_rate": 9.802245431863757e-05,
1854
+ "loss": 0.9937,
1855
+ "step": 2500
1856
+ },
1857
+ {
1858
+ "epoch": 0.91,
1859
+ "grad_norm": 0.3864617347717285,
1860
+ "learning_rate": 9.800658125291984e-05,
1861
+ "loss": 0.9986,
1862
+ "step": 2510
1863
+ },
1864
+ {
1865
+ "epoch": 0.91,
1866
+ "grad_norm": 0.31850436329841614,
1867
+ "learning_rate": 9.79906460334052e-05,
1868
+ "loss": 0.9984,
1869
+ "step": 2520
1870
+ },
1871
+ {
1872
+ "epoch": 0.92,
1873
+ "grad_norm": 0.25421255826950073,
1874
+ "learning_rate": 9.797464868072488e-05,
1875
+ "loss": 1.0273,
1876
+ "step": 2530
1877
+ },
1878
+ {
1879
+ "epoch": 0.92,
1880
+ "grad_norm": 0.34440311789512634,
1881
+ "learning_rate": 9.795858921559052e-05,
1882
+ "loss": 1.0346,
1883
+ "step": 2540
1884
+ },
1885
+ {
1886
+ "epoch": 0.92,
1887
+ "grad_norm": 0.33147209882736206,
1888
+ "learning_rate": 9.79424676587942e-05,
1889
+ "loss": 1.0691,
1890
+ "step": 2550
1891
+ },
1892
+ {
1893
+ "epoch": 0.93,
1894
+ "grad_norm": 0.2778458893299103,
1895
+ "learning_rate": 9.792628403120842e-05,
1896
+ "loss": 1.009,
1897
+ "step": 2560
1898
+ },
1899
+ {
1900
+ "epoch": 0.93,
1901
+ "grad_norm": 0.29282572865486145,
1902
+ "learning_rate": 9.791003835378598e-05,
1903
+ "loss": 1.0015,
1904
+ "step": 2570
1905
+ },
1906
+ {
1907
+ "epoch": 0.93,
1908
+ "grad_norm": 0.25391730666160583,
1909
+ "learning_rate": 9.789373064756008e-05,
1910
+ "loss": 1.0177,
1911
+ "step": 2580
1912
+ },
1913
+ {
1914
+ "epoch": 0.94,
1915
+ "grad_norm": 0.23779381811618805,
1916
+ "learning_rate": 9.787736093364416e-05,
1917
+ "loss": 1.0935,
1918
+ "step": 2590
1919
+ },
1920
+ {
1921
+ "epoch": 0.94,
1922
+ "grad_norm": 0.2965840995311737,
1923
+ "learning_rate": 9.786092923323203e-05,
1924
+ "loss": 1.0002,
1925
+ "step": 2600
1926
+ },
1927
+ {
1928
+ "epoch": 0.94,
1929
+ "eval_loss": 1.005922555923462,
1930
+ "eval_runtime": 125.0587,
1931
+ "eval_samples_per_second": 62.379,
1932
+ "eval_steps_per_second": 3.902,
1933
+ "step": 2600
1934
+ },
1935
+ {
1936
+ "epoch": 0.94,
1937
+ "grad_norm": 0.23760788142681122,
1938
+ "learning_rate": 9.784443556759766e-05,
1939
+ "loss": 1.0305,
1940
+ "step": 2610
1941
+ },
1942
+ {
1943
+ "epoch": 0.95,
1944
+ "grad_norm": 0.22895409166812897,
1945
+ "learning_rate": 9.78278799580953e-05,
1946
+ "loss": 1.0427,
1947
+ "step": 2620
1948
+ },
1949
+ {
1950
+ "epoch": 0.95,
1951
+ "grad_norm": 0.36007368564605713,
1952
+ "learning_rate": 9.781126242615939e-05,
1953
+ "loss": 1.0059,
1954
+ "step": 2630
1955
+ },
1956
+ {
1957
+ "epoch": 0.96,
1958
+ "grad_norm": 0.2813151776790619,
1959
+ "learning_rate": 9.779458299330452e-05,
1960
+ "loss": 1.0418,
1961
+ "step": 2640
1962
+ },
1963
+ {
1964
+ "epoch": 0.96,
1965
+ "grad_norm": 0.27038782835006714,
1966
+ "learning_rate": 9.777784168112545e-05,
1967
+ "loss": 1.0092,
1968
+ "step": 2650
1969
+ },
1970
+ {
1971
+ "epoch": 0.96,
1972
+ "grad_norm": 0.22898097336292267,
1973
+ "learning_rate": 9.776103851129706e-05,
1974
+ "loss": 0.9883,
1975
+ "step": 2660
1976
+ },
1977
+ {
1978
+ "epoch": 0.97,
1979
+ "grad_norm": 0.2213810682296753,
1980
+ "learning_rate": 9.774417350557428e-05,
1981
+ "loss": 1.0753,
1982
+ "step": 2670
1983
+ },
1984
+ {
1985
+ "epoch": 0.97,
1986
+ "grad_norm": 0.22410623729228973,
1987
+ "learning_rate": 9.772724668579212e-05,
1988
+ "loss": 1.0524,
1989
+ "step": 2680
1990
+ },
1991
+ {
1992
+ "epoch": 0.97,
1993
+ "grad_norm": 0.3005650043487549,
1994
+ "learning_rate": 9.771025807386562e-05,
1995
+ "loss": 1.0562,
1996
+ "step": 2690
1997
+ },
1998
+ {
1999
+ "epoch": 0.98,
2000
+ "grad_norm": 0.3941683769226074,
2001
+ "learning_rate": 9.769320769178983e-05,
2002
+ "loss": 0.9925,
2003
+ "step": 2700
2004
+ },
2005
+ {
2006
+ "epoch": 0.98,
2007
+ "grad_norm": 0.2829142212867737,
2008
+ "learning_rate": 9.767609556163977e-05,
2009
+ "loss": 1.014,
2010
+ "step": 2710
2011
+ },
2012
+ {
2013
+ "epoch": 0.98,
2014
+ "grad_norm": 0.29680418968200684,
2015
+ "learning_rate": 9.765892170557038e-05,
2016
+ "loss": 0.9677,
2017
+ "step": 2720
2018
+ },
2019
+ {
2020
+ "epoch": 0.99,
2021
+ "grad_norm": 0.22002767026424408,
2022
+ "learning_rate": 9.764168614581655e-05,
2023
+ "loss": 0.9954,
2024
+ "step": 2730
2025
+ },
2026
+ {
2027
+ "epoch": 0.99,
2028
+ "grad_norm": 0.2758820354938507,
2029
+ "learning_rate": 9.762438890469304e-05,
2030
+ "loss": 1.0029,
2031
+ "step": 2740
2032
+ },
2033
+ {
2034
+ "epoch": 1.0,
2035
+ "grad_norm": 0.2981850802898407,
2036
+ "learning_rate": 9.760703000459446e-05,
2037
+ "loss": 1.0555,
2038
+ "step": 2750
2039
+ },
2040
+ {
2041
+ "epoch": 1.0,
2042
+ "grad_norm": 0.22340857982635498,
2043
+ "learning_rate": 9.758960946799528e-05,
2044
+ "loss": 1.0394,
2045
+ "step": 2760
2046
+ },
2047
+ {
2048
+ "epoch": 1.0,
2049
+ "grad_norm": 0.19991633296012878,
2050
+ "learning_rate": 9.757212731744974e-05,
2051
+ "loss": 0.9325,
2052
+ "step": 2770
2053
+ },
2054
+ {
2055
+ "epoch": 1.01,
2056
+ "grad_norm": 0.30030888319015503,
2057
+ "learning_rate": 9.755458357559186e-05,
2058
+ "loss": 0.9711,
2059
+ "step": 2780
2060
+ },
2061
+ {
2062
+ "epoch": 1.01,
2063
+ "grad_norm": 0.3804832696914673,
2064
+ "learning_rate": 9.753697826513541e-05,
2065
+ "loss": 0.9651,
2066
+ "step": 2790
2067
+ },
2068
+ {
2069
+ "epoch": 1.01,
2070
+ "grad_norm": 0.46047547459602356,
2071
+ "learning_rate": 9.751931140887387e-05,
2072
+ "loss": 0.9686,
2073
+ "step": 2800
2074
+ },
2075
+ {
2076
+ "epoch": 1.01,
2077
+ "eval_loss": 1.0086077451705933,
2078
+ "eval_runtime": 124.6354,
2079
+ "eval_samples_per_second": 62.591,
2080
+ "eval_steps_per_second": 3.915,
2081
+ "step": 2800
2082
+ },
2083
+ {
2084
+ "epoch": 1.02,
2085
+ "grad_norm": 0.30646952986717224,
2086
+ "learning_rate": 9.750158302968039e-05,
2087
+ "loss": 0.9267,
2088
+ "step": 2810
2089
+ },
2090
+ {
2091
+ "epoch": 1.02,
2092
+ "grad_norm": 0.3007545471191406,
2093
+ "learning_rate": 9.748379315050778e-05,
2094
+ "loss": 1.0193,
2095
+ "step": 2820
2096
+ },
2097
+ {
2098
+ "epoch": 1.02,
2099
+ "grad_norm": 0.2814784049987793,
2100
+ "learning_rate": 9.74659417943885e-05,
2101
+ "loss": 0.8893,
2102
+ "step": 2830
2103
+ },
2104
+ {
2105
+ "epoch": 1.03,
2106
+ "grad_norm": 0.2728348970413208,
2107
+ "learning_rate": 9.744802898443456e-05,
2108
+ "loss": 0.937,
2109
+ "step": 2840
2110
+ },
2111
+ {
2112
+ "epoch": 1.03,
2113
+ "grad_norm": 0.2994844913482666,
2114
+ "learning_rate": 9.743005474383755e-05,
2115
+ "loss": 0.949,
2116
+ "step": 2850
2117
+ },
2118
+ {
2119
+ "epoch": 1.04,
2120
+ "grad_norm": 0.43111738562583923,
2121
+ "learning_rate": 9.741201909586861e-05,
2122
+ "loss": 0.9897,
2123
+ "step": 2860
2124
+ },
2125
+ {
2126
+ "epoch": 1.04,
2127
+ "grad_norm": 0.29551658034324646,
2128
+ "learning_rate": 9.739392206387838e-05,
2129
+ "loss": 0.9393,
2130
+ "step": 2870
2131
+ },
2132
+ {
2133
+ "epoch": 1.04,
2134
+ "grad_norm": 0.40380623936653137,
2135
+ "learning_rate": 9.737576367129694e-05,
2136
+ "loss": 0.9365,
2137
+ "step": 2880
2138
+ },
2139
+ {
2140
+ "epoch": 1.05,
2141
+ "grad_norm": 0.2757427394390106,
2142
+ "learning_rate": 9.735754394163386e-05,
2143
+ "loss": 1.0074,
2144
+ "step": 2890
2145
+ },
2146
+ {
2147
+ "epoch": 1.05,
2148
+ "grad_norm": 0.35594430565834045,
2149
+ "learning_rate": 9.73392628984781e-05,
2150
+ "loss": 0.9682,
2151
+ "step": 2900
2152
+ },
2153
+ {
2154
+ "epoch": 1.05,
2155
+ "grad_norm": 0.32288888096809387,
2156
+ "learning_rate": 9.732092056549799e-05,
2157
+ "loss": 0.9753,
2158
+ "step": 2910
2159
+ },
2160
+ {
2161
+ "epoch": 1.06,
2162
+ "grad_norm": 0.3491690158843994,
2163
+ "learning_rate": 9.730251696644122e-05,
2164
+ "loss": 0.926,
2165
+ "step": 2920
2166
+ },
2167
+ {
2168
+ "epoch": 1.06,
2169
+ "grad_norm": 0.41806405782699585,
2170
+ "learning_rate": 9.728405212513483e-05,
2171
+ "loss": 0.9993,
2172
+ "step": 2930
2173
+ },
2174
+ {
2175
+ "epoch": 1.06,
2176
+ "grad_norm": 0.4885188043117523,
2177
+ "learning_rate": 9.726552606548512e-05,
2178
+ "loss": 0.9879,
2179
+ "step": 2940
2180
+ },
2181
+ {
2182
+ "epoch": 1.07,
2183
+ "grad_norm": 0.41796302795410156,
2184
+ "learning_rate": 9.724693881147761e-05,
2185
+ "loss": 0.9626,
2186
+ "step": 2950
2187
+ },
2188
+ {
2189
+ "epoch": 1.07,
2190
+ "grad_norm": 0.39677631855010986,
2191
+ "learning_rate": 9.722829038717717e-05,
2192
+ "loss": 0.9767,
2193
+ "step": 2960
2194
+ },
2195
+ {
2196
+ "epoch": 1.07,
2197
+ "grad_norm": 0.5329232215881348,
2198
+ "learning_rate": 9.720958081672773e-05,
2199
+ "loss": 0.9357,
2200
+ "step": 2970
2201
+ },
2202
+ {
2203
+ "epoch": 1.08,
2204
+ "grad_norm": 0.4468931257724762,
2205
+ "learning_rate": 9.719081012435247e-05,
2206
+ "loss": 0.9705,
2207
+ "step": 2980
2208
+ },
2209
+ {
2210
+ "epoch": 1.08,
2211
+ "grad_norm": 0.4029316306114197,
2212
+ "learning_rate": 9.717197833435367e-05,
2213
+ "loss": 0.9727,
2214
+ "step": 2990
2215
+ },
2216
+ {
2217
+ "epoch": 1.09,
2218
+ "grad_norm": 0.37598028779029846,
2219
+ "learning_rate": 9.715308547111273e-05,
2220
+ "loss": 0.9767,
2221
+ "step": 3000
2222
+ },
2223
+ {
2224
+ "epoch": 1.09,
2225
+ "eval_loss": 1.014098048210144,
2226
+ "eval_runtime": 125.4232,
2227
+ "eval_samples_per_second": 62.197,
2228
+ "eval_steps_per_second": 3.891,
2229
+ "step": 3000
2230
+ },
2231
+ {
2232
+ "epoch": 1.09,
2233
+ "grad_norm": 0.3833357095718384,
2234
+ "learning_rate": 9.713413155909009e-05,
2235
+ "loss": 0.9605,
2236
+ "step": 3010
2237
+ },
2238
+ {
2239
+ "epoch": 1.09,
2240
+ "grad_norm": 0.4391871988773346,
2241
+ "learning_rate": 9.711511662282527e-05,
2242
+ "loss": 0.9611,
2243
+ "step": 3020
2244
+ },
2245
+ {
2246
+ "epoch": 1.1,
2247
+ "grad_norm": 0.39860454201698303,
2248
+ "learning_rate": 9.709604068693679e-05,
2249
+ "loss": 0.9222,
2250
+ "step": 3030
2251
+ },
2252
+ {
2253
+ "epoch": 1.1,
2254
+ "grad_norm": 0.33882561326026917,
2255
+ "learning_rate": 9.707690377612211e-05,
2256
+ "loss": 0.9369,
2257
+ "step": 3040
2258
+ },
2259
+ {
2260
+ "epoch": 1.1,
2261
+ "grad_norm": 0.3763039708137512,
2262
+ "learning_rate": 9.705770591515768e-05,
2263
+ "loss": 0.8864,
2264
+ "step": 3050
2265
+ },
2266
+ {
2267
+ "epoch": 1.11,
2268
+ "grad_norm": 0.3221600353717804,
2269
+ "learning_rate": 9.703844712889884e-05,
2270
+ "loss": 0.9753,
2271
+ "step": 3060
2272
+ },
2273
+ {
2274
+ "epoch": 1.11,
2275
+ "grad_norm": 0.3342023491859436,
2276
+ "learning_rate": 9.701912744227979e-05,
2277
+ "loss": 0.9233,
2278
+ "step": 3070
2279
+ },
2280
+ {
2281
+ "epoch": 1.11,
2282
+ "grad_norm": 0.4082651734352112,
2283
+ "learning_rate": 9.699974688031363e-05,
2284
+ "loss": 0.987,
2285
+ "step": 3080
2286
+ },
2287
+ {
2288
+ "epoch": 1.12,
2289
+ "grad_norm": 0.4198564291000366,
2290
+ "learning_rate": 9.69803054680922e-05,
2291
+ "loss": 0.8833,
2292
+ "step": 3090
2293
+ },
2294
+ {
2295
+ "epoch": 1.12,
2296
+ "grad_norm": 0.3833492398262024,
2297
+ "learning_rate": 9.696080323078621e-05,
2298
+ "loss": 0.9894,
2299
+ "step": 3100
2300
+ },
2301
+ {
2302
+ "epoch": 1.13,
2303
+ "grad_norm": 0.35935208201408386,
2304
+ "learning_rate": 9.694124019364505e-05,
2305
+ "loss": 0.9417,
2306
+ "step": 3110
2307
+ },
2308
+ {
2309
+ "epoch": 1.13,
2310
+ "grad_norm": 0.3433043658733368,
2311
+ "learning_rate": 9.692161638199686e-05,
2312
+ "loss": 0.9251,
2313
+ "step": 3120
2314
+ },
2315
+ {
2316
+ "epoch": 1.13,
2317
+ "grad_norm": 0.30163127183914185,
2318
+ "learning_rate": 9.690193182124844e-05,
2319
+ "loss": 0.9447,
2320
+ "step": 3130
2321
+ },
2322
+ {
2323
+ "epoch": 1.14,
2324
+ "grad_norm": 0.4361821711063385,
2325
+ "learning_rate": 9.68821865368853e-05,
2326
+ "loss": 0.9984,
2327
+ "step": 3140
2328
+ },
2329
+ {
2330
+ "epoch": 1.14,
2331
+ "grad_norm": 0.4263075888156891,
2332
+ "learning_rate": 9.686238055447148e-05,
2333
+ "loss": 0.9422,
2334
+ "step": 3150
2335
+ },
2336
+ {
2337
+ "epoch": 1.14,
2338
+ "grad_norm": 0.33963072299957275,
2339
+ "learning_rate": 9.684251389964967e-05,
2340
+ "loss": 0.9199,
2341
+ "step": 3160
2342
+ },
2343
+ {
2344
+ "epoch": 1.15,
2345
+ "grad_norm": 0.41040754318237305,
2346
+ "learning_rate": 9.68225865981411e-05,
2347
+ "loss": 0.9249,
2348
+ "step": 3170
2349
+ },
2350
+ {
2351
+ "epoch": 1.15,
2352
+ "grad_norm": 0.3697950839996338,
2353
+ "learning_rate": 9.680259867574552e-05,
2354
+ "loss": 0.947,
2355
+ "step": 3180
2356
+ },
2357
+ {
2358
+ "epoch": 1.15,
2359
+ "grad_norm": 0.3211696743965149,
2360
+ "learning_rate": 9.678255015834112e-05,
2361
+ "loss": 0.9956,
2362
+ "step": 3190
2363
+ },
2364
+ {
2365
+ "epoch": 1.16,
2366
+ "grad_norm": 0.4463675022125244,
2367
+ "learning_rate": 9.676244107188463e-05,
2368
+ "loss": 0.9494,
2369
+ "step": 3200
2370
+ },
2371
+ {
2372
+ "epoch": 1.16,
2373
+ "eval_loss": 1.0160499811172485,
2374
+ "eval_runtime": 124.6588,
2375
+ "eval_samples_per_second": 62.579,
2376
+ "eval_steps_per_second": 3.915,
2377
+ "step": 3200
2378
+ },
2379
+ {
2380
+ "epoch": 1.16,
2381
+ "step": 3200,
2382
+ "total_flos": 8.146148608211681e+17,
2383
+ "train_loss": 1.015018144249916,
2384
+ "train_runtime": 4695.8401,
2385
+ "train_samples_per_second": 94.128,
2386
+ "train_steps_per_second": 5.884
2387
+ }
2388
+ ],
2389
+ "logging_steps": 10,
2390
+ "max_steps": 27630,
2391
+ "num_input_tokens_seen": 0,
2392
+ "num_train_epochs": 10,
2393
+ "save_steps": 1000,
2394
+ "total_flos": 8.146148608211681e+17,
2395
+ "train_batch_size": 8,
2396
+ "trial_name": null,
2397
+ "trial_params": null
2398
+ }
llama2_13b_peft/alpaca/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee00a18a03b57747a77ea7ce888e4df0481f3a55be563ee85ba65d3e1221563f
3
+ size 5048
llama2_13b_peft/alpaca/training_eval_loss.png ADDED
llama2_13b_peft/alpaca/training_loss.png ADDED
llama2_13b_peft/cnn_dailymail/README.md ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: peft
4
+ tags:
5
+ - llama-factory
6
+ - lora
7
+ - generated_from_trainer
8
+ base_model: /data1/model/llama2/meta-llama/Llama2-13b
9
+ model-index:
10
+ - name: cnn_dailymail_no_sys
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # cnn_dailymail_no_sys
18
+
19
+ This model is a fine-tuned version of Llama2-13b (loaded from the local path `/data1/model/llama2/meta-llama/Llama2-13b`) on the cnn_dailymail_no_sys dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.9911
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 0.0001
41
+ - train_batch_size: 8
42
+ - eval_batch_size: 8
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - num_devices: 3
46
+ - total_train_batch_size: 24
47
+ - total_eval_batch_size: 24
48
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
+ - lr_scheduler_type: cosine
50
+ - lr_scheduler_warmup_steps: 20
51
+ - num_epochs: 10.0
52
+
53
+ ### Training results
54
+
55
+ | Training Loss | Epoch | Step | Validation Loss |
56
+ |:-------------:|:-----:|:----:|:---------------:|
57
+ | 1.0297 | 0.14 | 50 | 1.0376 |
58
+ | 1.0043 | 0.28 | 100 | 1.0226 |
59
+ | 1.0241 | 0.42 | 150 | 1.0131 |
60
+ | 1.0134 | 0.56 | 200 | 1.0083 |
61
+ | 1.0284 | 0.7 | 250 | 1.0041 |
62
+ | 1.0126 | 0.85 | 300 | 0.9974 |
63
+ | 0.9779 | 0.99 | 350 | 0.9922 |
64
+ | 0.9239 | 1.13 | 400 | 0.9984 |
65
+ | 0.9456 | 1.27 | 450 | 0.9971 |
66
+ | 0.9064 | 1.41 | 500 | 0.9911 |
67
+ | 0.9238 | 1.55 | 550 | 0.9884 |
68
+ | 0.9053 | 1.69 | 600 | 0.9863 |
69
+ | 0.9066 | 1.83 | 650 | 0.9829 |
70
+ | 0.932 | 1.97 | 700 | 0.9802 |
71
+ | 0.7339 | 2.11 | 750 | 1.0370 |
72
+ | 0.7678 | 2.25 | 800 | 1.0321 |
73
+ | 0.7538 | 2.39 | 850 | 1.0240 |
74
+
75
+
76
+ ### Framework versions
77
+
78
+ - PEFT 0.9.0
79
+ - Transformers 4.38.2
80
+ - Pytorch 2.2.1
81
+ - Datasets 2.18.0
82
+ - Tokenizers 0.15.2
llama2_13b_peft/cnn_dailymail/adapter_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/data1/model/llama2/meta-llama/Llama2-13b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.0,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 8,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "o_proj",
23
+ "up_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "down_proj",
27
+ "q_proj",
28
+ "gate_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "use_dora": false,
32
+ "use_rslora": false
33
+ }
llama2_13b_peft/cnn_dailymail/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91b54b9ab362c86ed5fa1b8aceba1d8f17d63dbc2ab01d716dab026362e2c096
3
+ size 125248064
llama2_13b_peft/cnn_dailymail/all_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.39,
3
+ "eval_loss": 0.9910521507263184,
4
+ "eval_runtime": 121.9871,
5
+ "eval_samples_per_second": 12.296,
6
+ "eval_steps_per_second": 0.516,
7
+ "train_loss": 0.943028081445133,
8
+ "train_runtime": 7174.3764,
9
+ "train_samples_per_second": 11.848,
10
+ "train_steps_per_second": 0.495
11
+ }
llama2_13b_peft/cnn_dailymail/eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.39,
3
+ "eval_loss": 0.9910521507263184,
4
+ "eval_runtime": 121.9871,
5
+ "eval_samples_per_second": 12.296,
6
+ "eval_steps_per_second": 0.516
7
+ }
llama2_13b_peft/cnn_dailymail/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
llama2_13b_peft/cnn_dailymail/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
llama2_13b_peft/cnn_dailymail/tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "LlamaTokenizer",
43
+ "unk_token": "<unk>",
44
+ "use_default_system_prompt": false
45
+ }
llama2_13b_peft/cnn_dailymail/train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.39,
3
+ "train_loss": 0.943028081445133,
4
+ "train_runtime": 7174.3764,
5
+ "train_samples_per_second": 11.848,
6
+ "train_steps_per_second": 0.495
7
+ }
llama2_13b_peft/cnn_dailymail/trainer_log.jsonl ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 3550, "loss": 1.6246, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5e-05, "epoch": 0.03, "percentage": 0.28, "elapsed_time": "0:01:01", "remaining_time": "6:03:32"}
2
+ {"current_steps": 20, "total_steps": 3550, "loss": 1.3935, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001, "epoch": 0.06, "percentage": 0.56, "elapsed_time": "0:01:58", "remaining_time": "5:47:35"}
3
+ {"current_steps": 30, "total_steps": 3550, "loss": 1.1048, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999801989839055e-05, "epoch": 0.08, "percentage": 0.85, "elapsed_time": "0:02:57", "remaining_time": "5:47:59"}
4
+ {"current_steps": 40, "total_steps": 3550, "loss": 1.0825, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.999207975039429e-05, "epoch": 0.11, "percentage": 1.13, "elapsed_time": "0:03:56", "remaining_time": "5:45:24"}
5
+ {"current_steps": 50, "total_steps": 3550, "loss": 1.0297, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.998218002649506e-05, "epoch": 0.14, "percentage": 1.41, "elapsed_time": "0:04:57", "remaining_time": "5:47:02"}
6
+ {"current_steps": 50, "total_steps": 3550, "loss": null, "eval_loss": 1.0375804901123047, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.14, "percentage": 1.41, "elapsed_time": "0:04:57", "remaining_time": "5:47:02"}
7
+ {"current_steps": 60, "total_steps": 3550, "loss": 1.0166, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.996832151079127e-05, "epoch": 0.17, "percentage": 1.69, "elapsed_time": "0:07:55", "remaining_time": "7:40:39"}
8
+ {"current_steps": 70, "total_steps": 3550, "loss": 1.0402, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.995050530093367e-05, "epoch": 0.2, "percentage": 1.97, "elapsed_time": "0:09:00", "remaining_time": "7:27:41"}
9
+ {"current_steps": 80, "total_steps": 3550, "loss": 1.0182, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.992873280803849e-05, "epoch": 0.23, "percentage": 2.25, "elapsed_time": "0:10:01", "remaining_time": "7:15:05"}
10
+ {"current_steps": 90, "total_steps": 3550, "loss": 1.0422, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.990300575657565e-05, "epoch": 0.25, "percentage": 2.54, "elapsed_time": "0:11:00", "remaining_time": "7:03:17"}
11
+ {"current_steps": 100, "total_steps": 3550, "loss": 1.0043, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.987332618423221e-05, "epoch": 0.28, "percentage": 2.82, "elapsed_time": "0:12:00", "remaining_time": "6:54:04"}
12
+ {"current_steps": 100, "total_steps": 3550, "loss": null, "eval_loss": 1.022628903388977, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.28, "percentage": 2.82, "elapsed_time": "0:12:00", "remaining_time": "6:54:04"}
13
+ {"current_steps": 110, "total_steps": 3550, "loss": 1.0384, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.983969644175091e-05, "epoch": 0.31, "percentage": 3.1, "elapsed_time": "0:14:59", "remaining_time": "7:48:47"}
14
+ {"current_steps": 120, "total_steps": 3550, "loss": 1.0326, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.980211919274407e-05, "epoch": 0.34, "percentage": 3.38, "elapsed_time": "0:15:57", "remaining_time": "7:36:06"}
15
+ {"current_steps": 130, "total_steps": 3550, "loss": 1.0133, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.976059741348251e-05, "epoch": 0.37, "percentage": 3.66, "elapsed_time": "0:17:01", "remaining_time": "7:27:54"}
16
+ {"current_steps": 140, "total_steps": 3550, "loss": 1.0129, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.971513439265991e-05, "epoch": 0.39, "percentage": 3.94, "elapsed_time": "0:18:05", "remaining_time": "7:20:33"}
17
+ {"current_steps": 150, "total_steps": 3550, "loss": 1.0241, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.96657337311323e-05, "epoch": 0.42, "percentage": 4.23, "elapsed_time": "0:19:07", "remaining_time": "7:13:40"}
18
+ {"current_steps": 150, "total_steps": 3550, "loss": null, "eval_loss": 1.0130971670150757, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.42, "percentage": 4.23, "elapsed_time": "0:19:07", "remaining_time": "7:13:40"}
19
+ {"current_steps": 160, "total_steps": 3550, "loss": 1.038, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.961239934163289e-05, "epoch": 0.45, "percentage": 4.51, "elapsed_time": "0:22:08", "remaining_time": "7:49:07"}
20
+ {"current_steps": 170, "total_steps": 3550, "loss": 1.0083, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.955513544846205e-05, "epoch": 0.48, "percentage": 4.79, "elapsed_time": "0:23:08", "remaining_time": "7:40:05"}
21
+ {"current_steps": 180, "total_steps": 3550, "loss": 0.9988, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.949394658715288e-05, "epoch": 0.51, "percentage": 5.07, "elapsed_time": "0:24:05", "remaining_time": "7:30:54"}
22
+ {"current_steps": 190, "total_steps": 3550, "loss": 1.0344, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.942883760411188e-05, "epoch": 0.54, "percentage": 5.35, "elapsed_time": "0:25:02", "remaining_time": "7:22:49"}
23
+ {"current_steps": 200, "total_steps": 3550, "loss": 1.0134, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.935981365623516e-05, "epoch": 0.56, "percentage": 5.63, "elapsed_time": "0:26:03", "remaining_time": "7:16:33"}
24
+ {"current_steps": 200, "total_steps": 3550, "loss": null, "eval_loss": 1.0082659721374512, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.56, "percentage": 5.63, "elapsed_time": "0:26:03", "remaining_time": "7:16:33"}
25
+ {"current_steps": 210, "total_steps": 3550, "loss": 1.0259, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.928688021049992e-05, "epoch": 0.59, "percentage": 5.92, "elapsed_time": "0:29:03", "remaining_time": "7:42:13"}
26
+ {"current_steps": 220, "total_steps": 3550, "loss": 1.0102, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.921004304353147e-05, "epoch": 0.62, "percentage": 6.2, "elapsed_time": "0:30:04", "remaining_time": "7:35:16"}
27
+ {"current_steps": 230, "total_steps": 3550, "loss": 0.9969, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.912930824114576e-05, "epoch": 0.65, "percentage": 6.48, "elapsed_time": "0:31:04", "remaining_time": "7:28:27"}
28
+ {"current_steps": 240, "total_steps": 3550, "loss": 0.9626, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.904468219786727e-05, "epoch": 0.68, "percentage": 6.76, "elapsed_time": "0:32:01", "remaining_time": "7:21:37"}
29
+ {"current_steps": 250, "total_steps": 3550, "loss": 1.0284, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.895617161642256e-05, "epoch": 0.7, "percentage": 7.04, "elapsed_time": "0:33:04", "remaining_time": "7:16:40"}
30
+ {"current_steps": 250, "total_steps": 3550, "loss": null, "eval_loss": 1.0040661096572876, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.7, "percentage": 7.04, "elapsed_time": "0:33:04", "remaining_time": "7:16:40"}
31
+ {"current_steps": 260, "total_steps": 3550, "loss": 0.9762, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.886378350720943e-05, "epoch": 0.73, "percentage": 7.32, "elapsed_time": "0:36:06", "remaining_time": "7:36:48"}
32
+ {"current_steps": 270, "total_steps": 3550, "loss": 1.0387, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.876752518774166e-05, "epoch": 0.76, "percentage": 7.61, "elapsed_time": "0:37:07", "remaining_time": "7:30:56"}
33
+ {"current_steps": 280, "total_steps": 3550, "loss": 0.9725, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.866740428206934e-05, "epoch": 0.79, "percentage": 7.89, "elapsed_time": "0:38:09", "remaining_time": "7:25:39"}
34
+ {"current_steps": 290, "total_steps": 3550, "loss": 1.0357, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.856342872017515e-05, "epoch": 0.82, "percentage": 8.17, "elapsed_time": "0:39:08", "remaining_time": "7:19:59"}
35
+ {"current_steps": 300, "total_steps": 3550, "loss": 1.0126, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.845560673734617e-05, "epoch": 0.85, "percentage": 8.45, "elapsed_time": "0:40:10", "remaining_time": "7:15:13"}
36
+ {"current_steps": 300, "total_steps": 3550, "loss": null, "eval_loss": 0.9973611831665039, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.85, "percentage": 8.45, "elapsed_time": "0:40:10", "remaining_time": "7:15:13"}
37
+ {"current_steps": 310, "total_steps": 3550, "loss": 0.9815, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.834394687352169e-05, "epoch": 0.87, "percentage": 8.73, "elapsed_time": "0:43:10", "remaining_time": "7:31:17"}
38
+ {"current_steps": 320, "total_steps": 3550, "loss": 0.9906, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.822845797261675e-05, "epoch": 0.9, "percentage": 9.01, "elapsed_time": "0:44:14", "remaining_time": "7:26:36"}
39
+ {"current_steps": 330, "total_steps": 3550, "loss": 1.0014, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.810914918182169e-05, "epoch": 0.93, "percentage": 9.3, "elapsed_time": "0:45:12", "remaining_time": "7:21:05"}
40
+ {"current_steps": 340, "total_steps": 3550, "loss": 0.9813, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.798602995087763e-05, "epoch": 0.96, "percentage": 9.58, "elapsed_time": "0:46:11", "remaining_time": "7:16:08"}
41
+ {"current_steps": 350, "total_steps": 3550, "loss": 0.9779, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.785911003132811e-05, "epoch": 0.99, "percentage": 9.86, "elapsed_time": "0:47:10", "remaining_time": "7:11:22"}
42
+ {"current_steps": 350, "total_steps": 3550, "loss": null, "eval_loss": 0.9921852350234985, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.99, "percentage": 9.86, "elapsed_time": "0:47:10", "remaining_time": "7:11:22"}
43
+ {"current_steps": 360, "total_steps": 3550, "loss": 0.9616, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.772839947574657e-05, "epoch": 1.01, "percentage": 10.14, "elapsed_time": "0:50:18", "remaining_time": "7:25:47"}
44
+ {"current_steps": 370, "total_steps": 3550, "loss": 0.9413, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.75939086369403e-05, "epoch": 1.04, "percentage": 10.42, "elapsed_time": "0:51:20", "remaining_time": "7:21:17"}
45
+ {"current_steps": 380, "total_steps": 3550, "loss": 0.9512, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.745564816713034e-05, "epoch": 1.07, "percentage": 10.7, "elapsed_time": "0:52:19", "remaining_time": "7:16:26"}
46
+ {"current_steps": 390, "total_steps": 3550, "loss": 0.935, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.731362901710783e-05, "epoch": 1.1, "percentage": 10.99, "elapsed_time": "0:53:21", "remaining_time": "7:12:20"}
47
+ {"current_steps": 400, "total_steps": 3550, "loss": 0.9239, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.71678624353667e-05, "epoch": 1.13, "percentage": 11.27, "elapsed_time": "0:54:19", "remaining_time": "7:07:51"}
48
+ {"current_steps": 400, "total_steps": 3550, "loss": null, "eval_loss": 0.9983939528465271, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.13, "percentage": 11.27, "elapsed_time": "0:54:19", "remaining_time": "7:07:51"}
49
+ {"current_steps": 410, "total_steps": 3550, "loss": 0.9265, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.701835996721265e-05, "epoch": 1.15, "percentage": 11.55, "elapsed_time": "0:57:22", "remaining_time": "7:19:28"}
50
+ {"current_steps": 420, "total_steps": 3550, "loss": 0.9361, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.686513345384882e-05, "epoch": 1.18, "percentage": 11.83, "elapsed_time": "0:58:29", "remaining_time": "7:15:52"}
51
+ {"current_steps": 430, "total_steps": 3550, "loss": 0.9043, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.67081950314378e-05, "epoch": 1.21, "percentage": 12.11, "elapsed_time": "0:59:24", "remaining_time": "7:11:01"}
52
+ {"current_steps": 440, "total_steps": 3550, "loss": 0.9006, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.654755713014052e-05, "epoch": 1.24, "percentage": 12.39, "elapsed_time": "1:00:21", "remaining_time": "7:06:36"}
53
+ {"current_steps": 450, "total_steps": 3550, "loss": 0.9456, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.638323247313166e-05, "epoch": 1.27, "percentage": 12.68, "elapsed_time": "1:01:20", "remaining_time": "7:02:31"}
54
+ {"current_steps": 450, "total_steps": 3550, "loss": null, "eval_loss": 0.9970971345901489, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.27, "percentage": 12.68, "elapsed_time": "1:01:20", "remaining_time": "7:02:31"}
55
+ {"current_steps": 460, "total_steps": 3550, "loss": 0.8814, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.621523407559193e-05, "epoch": 1.3, "percentage": 12.96, "elapsed_time": "1:04:20", "remaining_time": "7:12:11"}
56
+ {"current_steps": 470, "total_steps": 3550, "loss": 0.9459, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.604357524367722e-05, "epoch": 1.32, "percentage": 13.24, "elapsed_time": "1:05:22", "remaining_time": "7:08:22"}
57
+ {"current_steps": 480, "total_steps": 3550, "loss": 0.8911, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.586826957346473e-05, "epoch": 1.35, "percentage": 13.52, "elapsed_time": "1:06:21", "remaining_time": "7:04:26"}
58
+ {"current_steps": 490, "total_steps": 3550, "loss": 0.9142, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.568933094987601e-05, "epoch": 1.38, "percentage": 13.8, "elapsed_time": "1:07:16", "remaining_time": "7:00:07"}
59
+ {"current_steps": 500, "total_steps": 3550, "loss": 0.9064, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.550677354557734e-05, "epoch": 1.41, "percentage": 14.08, "elapsed_time": "1:08:15", "remaining_time": "6:56:20"}
60
+ {"current_steps": 500, "total_steps": 3550, "loss": null, "eval_loss": 0.9910521507263184, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.41, "percentage": 14.08, "elapsed_time": "1:08:15", "remaining_time": "6:56:20"}
61
+ {"current_steps": 510, "total_steps": 3550, "loss": 0.9389, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.532061181985714e-05, "epoch": 1.44, "percentage": 14.37, "elapsed_time": "1:11:14", "remaining_time": "7:04:37"}
62
+ {"current_steps": 520, "total_steps": 3550, "loss": 0.8996, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.513086051748068e-05, "epoch": 1.46, "percentage": 14.65, "elapsed_time": "1:12:16", "remaining_time": "7:01:07"}
63
+ {"current_steps": 530, "total_steps": 3550, "loss": 0.929, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.493753466752235e-05, "epoch": 1.49, "percentage": 14.93, "elapsed_time": "1:13:14", "remaining_time": "6:57:19"}
64
+ {"current_steps": 540, "total_steps": 3550, "loss": 0.9307, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.474064958217523e-05, "epoch": 1.52, "percentage": 15.21, "elapsed_time": "1:14:12", "remaining_time": "6:53:38"}
65
+ {"current_steps": 550, "total_steps": 3550, "loss": 0.9238, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.454022085553828e-05, "epoch": 1.55, "percentage": 15.49, "elapsed_time": "1:15:12", "remaining_time": "6:50:12"}
66
+ {"current_steps": 550, "total_steps": 3550, "loss": null, "eval_loss": 0.9884344935417175, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.55, "percentage": 15.49, "elapsed_time": "1:15:12", "remaining_time": "6:50:12"}
67
+ {"current_steps": 560, "total_steps": 3550, "loss": 0.9161, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.433626436238128e-05, "epoch": 1.58, "percentage": 15.77, "elapsed_time": "1:18:16", "remaining_time": "6:57:56"}
68
+ {"current_steps": 570, "total_steps": 3550, "loss": 0.8969, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.412879625688742e-05, "epoch": 1.61, "percentage": 16.06, "elapsed_time": "1:19:18", "remaining_time": "6:54:40"}
69
+ {"current_steps": 580, "total_steps": 3550, "loss": 0.9393, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.391783297137391e-05, "epoch": 1.63, "percentage": 16.34, "elapsed_time": "1:20:21", "remaining_time": "6:51:29"}
70
+ {"current_steps": 590, "total_steps": 3550, "loss": 0.9139, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.370339121499038e-05, "epoch": 1.66, "percentage": 16.62, "elapsed_time": "1:21:24", "remaining_time": "6:48:25"}
71
+ {"current_steps": 600, "total_steps": 3550, "loss": 0.9053, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.348548797239551e-05, "epoch": 1.69, "percentage": 16.9, "elapsed_time": "1:22:23", "remaining_time": "6:45:04"}
72
+ {"current_steps": 600, "total_steps": 3550, "loss": null, "eval_loss": 0.9863258600234985, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.69, "percentage": 16.9, "elapsed_time": "1:22:23", "remaining_time": "6:45:04"}
73
+ {"current_steps": 610, "total_steps": 3550, "loss": 0.9176, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.326414050241175e-05, "epoch": 1.72, "percentage": 17.18, "elapsed_time": "1:25:27", "remaining_time": "6:51:54"}
74
+ {"current_steps": 620, "total_steps": 3550, "loss": 0.9378, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.30393663366584e-05, "epoch": 1.75, "percentage": 17.46, "elapsed_time": "1:26:23", "remaining_time": "6:48:16"}
75
+ {"current_steps": 630, "total_steps": 3550, "loss": 0.9158, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.28111832781629e-05, "epoch": 1.77, "percentage": 17.75, "elapsed_time": "1:27:20", "remaining_time": "6:44:47"}
76
+ {"current_steps": 640, "total_steps": 3550, "loss": 0.9385, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.257960939995093e-05, "epoch": 1.8, "percentage": 18.03, "elapsed_time": "1:28:18", "remaining_time": "6:41:30"}
77
+ {"current_steps": 650, "total_steps": 3550, "loss": 0.9066, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.234466304361487e-05, "epoch": 1.83, "percentage": 18.31, "elapsed_time": "1:29:20", "remaining_time": "6:38:34"}
78
+ {"current_steps": 650, "total_steps": 3550, "loss": null, "eval_loss": 0.9829334616661072, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.83, "percentage": 18.31, "elapsed_time": "1:29:20", "remaining_time": "6:38:34"}
79
+ {"current_steps": 660, "total_steps": 3550, "loss": 0.9185, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.2106362817861e-05, "epoch": 1.86, "percentage": 18.59, "elapsed_time": "1:32:21", "remaining_time": "6:44:22"}
80
+ {"current_steps": 670, "total_steps": 3550, "loss": 0.9425, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.186472759703579e-05, "epoch": 1.89, "percentage": 18.87, "elapsed_time": "1:33:18", "remaining_time": "6:41:06"}
81
+ {"current_steps": 680, "total_steps": 3550, "loss": 0.9143, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.161977651963081e-05, "epoch": 1.92, "percentage": 19.15, "elapsed_time": "1:34:21", "remaining_time": "6:38:14"}
82
+ {"current_steps": 690, "total_steps": 3550, "loss": 0.8889, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.137152898676698e-05, "epoch": 1.94, "percentage": 19.44, "elapsed_time": "1:35:26", "remaining_time": "6:35:36"}
83
+ {"current_steps": 700, "total_steps": 3550, "loss": 0.932, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.112000466065784e-05, "epoch": 1.97, "percentage": 19.72, "elapsed_time": "1:36:24", "remaining_time": "6:32:33"}
84
+ {"current_steps": 700, "total_steps": 3550, "loss": null, "eval_loss": 0.9802341461181641, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.97, "percentage": 19.72, "elapsed_time": "1:36:24", "remaining_time": "6:32:33"}
85
+ {"current_steps": 710, "total_steps": 3550, "loss": 0.9121, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.086522346305233e-05, "epoch": 2.0, "percentage": 20.0, "elapsed_time": "1:39:23", "remaining_time": "6:37:33"}
86
+ {"current_steps": 720, "total_steps": 3550, "loss": 0.7502, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.060720557365683e-05, "epoch": 2.03, "percentage": 20.28, "elapsed_time": "1:40:28", "remaining_time": "6:34:54"}
87
+ {"current_steps": 730, "total_steps": 3550, "loss": 0.7308, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.034597142853684e-05, "epoch": 2.06, "percentage": 20.56, "elapsed_time": "1:41:28", "remaining_time": "6:31:59"}
88
+ {"current_steps": 740, "total_steps": 3550, "loss": 0.7424, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.008154171849844e-05, "epoch": 2.08, "percentage": 20.85, "elapsed_time": "1:42:26", "remaining_time": "6:29:00"}
89
+ {"current_steps": 750, "total_steps": 3550, "loss": 0.7339, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.981393738744939e-05, "epoch": 2.11, "percentage": 21.13, "elapsed_time": "1:43:27", "remaining_time": "6:26:13"}
90
+ {"current_steps": 750, "total_steps": 3550, "loss": null, "eval_loss": 1.0370149612426758, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.11, "percentage": 21.13, "elapsed_time": "1:43:27", "remaining_time": "6:26:13"}
91
+ {"current_steps": 760, "total_steps": 3550, "loss": 0.704, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.954317963074034e-05, "epoch": 2.14, "percentage": 21.41, "elapsed_time": "1:46:29", "remaining_time": "6:30:54"}
92
+ {"current_steps": 770, "total_steps": 3550, "loss": 0.7179, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.926928989348612e-05, "epoch": 2.17, "percentage": 21.69, "elapsed_time": "1:47:30", "remaining_time": "6:28:10"}
93
+ {"current_steps": 780, "total_steps": 3550, "loss": 0.7156, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.899228986886709e-05, "epoch": 2.2, "percentage": 21.97, "elapsed_time": "1:48:24", "remaining_time": "6:25:00"}
94
+ {"current_steps": 790, "total_steps": 3550, "loss": 0.7577, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.871220149641101e-05, "epoch": 2.23, "percentage": 22.25, "elapsed_time": "1:49:27", "remaining_time": "6:22:24"}
95
+ {"current_steps": 800, "total_steps": 3550, "loss": 0.7678, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.842904696025542e-05, "epoch": 2.25, "percentage": 22.54, "elapsed_time": "1:50:31", "remaining_time": "6:19:54"}
96
+ {"current_steps": 800, "total_steps": 3550, "loss": null, "eval_loss": 1.032098650932312, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.25, "percentage": 22.54, "elapsed_time": "1:50:31", "remaining_time": "6:19:54"}
97
+ {"current_steps": 810, "total_steps": 3550, "loss": 0.7659, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.814284868739038e-05, "epoch": 2.28, "percentage": 22.82, "elapsed_time": "1:53:31", "remaining_time": "6:24:02"}
98
+ {"current_steps": 820, "total_steps": 3550, "loss": 0.7443, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.785362934588233e-05, "epoch": 2.31, "percentage": 23.1, "elapsed_time": "1:54:29", "remaining_time": "6:21:11"}
99
+ {"current_steps": 830, "total_steps": 3550, "loss": 0.7736, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.756141184307864e-05, "epoch": 2.34, "percentage": 23.38, "elapsed_time": "1:55:29", "remaining_time": "6:18:28"}
100
+ {"current_steps": 840, "total_steps": 3550, "loss": 0.7527, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.726621932379319e-05, "epoch": 2.37, "percentage": 23.66, "elapsed_time": "1:56:28", "remaining_time": "6:15:47"}
101
+ {"current_steps": 850, "total_steps": 3550, "loss": 0.7538, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.696807516847328e-05, "epoch": 2.39, "percentage": 23.94, "elapsed_time": "1:57:27", "remaining_time": "6:13:06"}
102
+ {"current_steps": 850, "total_steps": 3550, "loss": null, "eval_loss": 1.0239675045013428, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.39, "percentage": 23.94, "elapsed_time": "1:57:27", "remaining_time": "6:13:06"}
103
+ {"current_steps": 850, "total_steps": 3550, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.39, "percentage": 23.94, "elapsed_time": "1:57:27", "remaining_time": "6:13:06"}
104
+ {"current_steps": 63, "total_steps": 63, "loss": null, "eval_loss": 0.9910521507263184, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.39, "percentage": 100.0, "elapsed_time": "2:01:44", "remaining_time": "0:00:00"}
llama2_13b_peft/cnn_dailymail/trainer_state.json ADDED
@@ -0,0 +1,761 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.9910521507263184,
3
+ "best_model_checkpoint": "ckpt/llama2_13b_fuze15_no_sys/cnn_dailymail_no_sys/checkpoint-500",
4
+ "epoch": 2.3943661971830985,
5
+ "eval_steps": 50,
6
+ "global_step": 850,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.03,
13
+ "grad_norm": 0.47264495491981506,
14
+ "learning_rate": 5e-05,
15
+ "loss": 1.6246,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.06,
20
+ "grad_norm": 0.349923312664032,
21
+ "learning_rate": 0.0001,
22
+ "loss": 1.3935,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.08,
27
+ "grad_norm": 0.3155190348625183,
28
+ "learning_rate": 9.999801989839055e-05,
29
+ "loss": 1.1048,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.11,
34
+ "grad_norm": 0.2674517333507538,
35
+ "learning_rate": 9.999207975039429e-05,
36
+ "loss": 1.0825,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.14,
41
+ "grad_norm": 0.2541864216327667,
42
+ "learning_rate": 9.998218002649506e-05,
43
+ "loss": 1.0297,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.14,
48
+ "eval_loss": 1.0375804901123047,
49
+ "eval_runtime": 122.2414,
50
+ "eval_samples_per_second": 12.271,
51
+ "eval_steps_per_second": 0.515,
52
+ "step": 50
53
+ },
54
+ {
55
+ "epoch": 0.17,
56
+ "grad_norm": 0.253811776638031,
57
+ "learning_rate": 9.996832151079127e-05,
58
+ "loss": 1.0166,
59
+ "step": 60
60
+ },
61
+ {
62
+ "epoch": 0.2,
63
+ "grad_norm": 0.2265736311674118,
64
+ "learning_rate": 9.995050530093367e-05,
65
+ "loss": 1.0402,
66
+ "step": 70
67
+ },
68
+ {
69
+ "epoch": 0.23,
70
+ "grad_norm": 0.37545114755630493,
71
+ "learning_rate": 9.992873280803849e-05,
72
+ "loss": 1.0182,
73
+ "step": 80
74
+ },
75
+ {
76
+ "epoch": 0.25,
77
+ "grad_norm": 0.2609192132949829,
78
+ "learning_rate": 9.990300575657565e-05,
79
+ "loss": 1.0422,
80
+ "step": 90
81
+ },
82
+ {
83
+ "epoch": 0.28,
84
+ "grad_norm": 0.28785914182662964,
85
+ "learning_rate": 9.987332618423221e-05,
86
+ "loss": 1.0043,
87
+ "step": 100
88
+ },
89
+ {
90
+ "epoch": 0.28,
91
+ "eval_loss": 1.022628903388977,
92
+ "eval_runtime": 122.0796,
93
+ "eval_samples_per_second": 12.287,
94
+ "eval_steps_per_second": 0.516,
95
+ "step": 100
96
+ },
97
+ {
98
+ "epoch": 0.31,
99
+ "grad_norm": 0.26292186975479126,
100
+ "learning_rate": 9.983969644175091e-05,
101
+ "loss": 1.0384,
102
+ "step": 110
103
+ },
104
+ {
105
+ "epoch": 0.34,
106
+ "grad_norm": 0.2501932382583618,
107
+ "learning_rate": 9.980211919274407e-05,
108
+ "loss": 1.0326,
109
+ "step": 120
110
+ },
111
+ {
112
+ "epoch": 0.37,
113
+ "grad_norm": 0.2618853449821472,
114
+ "learning_rate": 9.976059741348251e-05,
115
+ "loss": 1.0133,
116
+ "step": 130
117
+ },
118
+ {
119
+ "epoch": 0.39,
120
+ "grad_norm": 0.27294716238975525,
121
+ "learning_rate": 9.971513439265991e-05,
122
+ "loss": 1.0129,
123
+ "step": 140
124
+ },
125
+ {
126
+ "epoch": 0.42,
127
+ "grad_norm": 0.27111512422561646,
128
+ "learning_rate": 9.96657337311323e-05,
129
+ "loss": 1.0241,
130
+ "step": 150
131
+ },
132
+ {
133
+ "epoch": 0.42,
134
+ "eval_loss": 1.0130971670150757,
135
+ "eval_runtime": 122.2912,
136
+ "eval_samples_per_second": 12.266,
137
+ "eval_steps_per_second": 0.515,
138
+ "step": 150
139
+ },
140
+ {
141
+ "epoch": 0.45,
142
+ "grad_norm": 0.30171358585357666,
143
+ "learning_rate": 9.961239934163289e-05,
144
+ "loss": 1.038,
145
+ "step": 160
146
+ },
147
+ {
148
+ "epoch": 0.48,
149
+ "grad_norm": 0.2853068709373474,
150
+ "learning_rate": 9.955513544846205e-05,
151
+ "loss": 1.0083,
152
+ "step": 170
153
+ },
154
+ {
155
+ "epoch": 0.51,
156
+ "grad_norm": 0.3023884892463684,
157
+ "learning_rate": 9.949394658715288e-05,
158
+ "loss": 0.9988,
159
+ "step": 180
160
+ },
161
+ {
162
+ "epoch": 0.54,
163
+ "grad_norm": 0.3108798563480377,
164
+ "learning_rate": 9.942883760411188e-05,
165
+ "loss": 1.0344,
166
+ "step": 190
167
+ },
168
+ {
169
+ "epoch": 0.56,
170
+ "grad_norm": 0.2736767530441284,
171
+ "learning_rate": 9.935981365623516e-05,
172
+ "loss": 1.0134,
173
+ "step": 200
174
+ },
175
+ {
176
+ "epoch": 0.56,
177
+ "eval_loss": 1.0082659721374512,
178
+ "eval_runtime": 122.0985,
179
+ "eval_samples_per_second": 12.285,
180
+ "eval_steps_per_second": 0.516,
181
+ "step": 200
182
+ },
183
+ {
184
+ "epoch": 0.59,
185
+ "grad_norm": 0.2804129123687744,
186
+ "learning_rate": 9.928688021049992e-05,
187
+ "loss": 1.0259,
188
+ "step": 210
189
+ },
190
+ {
191
+ "epoch": 0.62,
192
+ "grad_norm": 0.29631489515304565,
193
+ "learning_rate": 9.921004304353147e-05,
194
+ "loss": 1.0102,
195
+ "step": 220
196
+ },
197
+ {
198
+ "epoch": 0.65,
199
+ "grad_norm": 0.28678563237190247,
200
+ "learning_rate": 9.912930824114576e-05,
201
+ "loss": 0.9969,
202
+ "step": 230
203
+ },
204
+ {
205
+ "epoch": 0.68,
206
+ "grad_norm": 0.35824358463287354,
207
+ "learning_rate": 9.904468219786727e-05,
208
+ "loss": 0.9626,
209
+ "step": 240
210
+ },
211
+ {
212
+ "epoch": 0.7,
213
+ "grad_norm": 0.26744136214256287,
214
+ "learning_rate": 9.895617161642256e-05,
215
+ "loss": 1.0284,
216
+ "step": 250
217
+ },
218
+ {
219
+ "epoch": 0.7,
220
+ "eval_loss": 1.0040661096572876,
221
+ "eval_runtime": 122.3546,
222
+ "eval_samples_per_second": 12.259,
223
+ "eval_steps_per_second": 0.515,
224
+ "step": 250
225
+ },
226
+ {
227
+ "epoch": 0.73,
228
+ "grad_norm": 0.29996350407600403,
229
+ "learning_rate": 9.886378350720943e-05,
230
+ "loss": 0.9762,
231
+ "step": 260
232
+ },
233
+ {
234
+ "epoch": 0.76,
235
+ "grad_norm": 0.2882774770259857,
236
+ "learning_rate": 9.876752518774166e-05,
237
+ "loss": 1.0387,
238
+ "step": 270
239
+ },
240
+ {
241
+ "epoch": 0.79,
242
+ "grad_norm": 0.3027604818344116,
243
+ "learning_rate": 9.866740428206934e-05,
244
+ "loss": 0.9725,
245
+ "step": 280
246
+ },
247
+ {
248
+ "epoch": 0.82,
249
+ "grad_norm": 0.29195544123649597,
250
+ "learning_rate": 9.856342872017515e-05,
251
+ "loss": 1.0357,
252
+ "step": 290
253
+ },
254
+ {
255
+ "epoch": 0.85,
256
+ "grad_norm": 0.3052995800971985,
257
+ "learning_rate": 9.845560673734617e-05,
258
+ "loss": 1.0126,
259
+ "step": 300
260
+ },
261
+ {
262
+ "epoch": 0.85,
263
+ "eval_loss": 0.9973611831665039,
264
+ "eval_runtime": 122.3104,
265
+ "eval_samples_per_second": 12.264,
266
+ "eval_steps_per_second": 0.515,
267
+ "step": 300
268
+ },
269
+ {
270
+ "epoch": 0.87,
271
+ "grad_norm": 0.316039115190506,
272
+ "learning_rate": 9.834394687352169e-05,
273
+ "loss": 0.9815,
274
+ "step": 310
275
+ },
276
+ {
277
+ "epoch": 0.9,
278
+ "grad_norm": 0.3242320120334625,
279
+ "learning_rate": 9.822845797261675e-05,
280
+ "loss": 0.9906,
281
+ "step": 320
282
+ },
283
+ {
284
+ "epoch": 0.93,
285
+ "grad_norm": 0.29159247875213623,
286
+ "learning_rate": 9.810914918182169e-05,
287
+ "loss": 1.0014,
288
+ "step": 330
289
+ },
290
+ {
291
+ "epoch": 0.96,
292
+ "grad_norm": 0.33583688735961914,
293
+ "learning_rate": 9.798602995087763e-05,
294
+ "loss": 0.9813,
295
+ "step": 340
296
+ },
297
+ {
298
+ "epoch": 0.99,
299
+ "grad_norm": 0.29560449719429016,
300
+ "learning_rate": 9.785911003132811e-05,
301
+ "loss": 0.9779,
302
+ "step": 350
303
+ },
304
+ {
305
+ "epoch": 0.99,
306
+ "eval_loss": 0.9921852350234985,
307
+ "eval_runtime": 122.2577,
308
+ "eval_samples_per_second": 12.269,
309
+ "eval_steps_per_second": 0.515,
310
+ "step": 350
311
+ },
312
+ {
313
+ "epoch": 1.01,
314
+ "grad_norm": 0.3149906098842621,
315
+ "learning_rate": 9.772839947574657e-05,
316
+ "loss": 0.9616,
317
+ "step": 360
318
+ },
319
+ {
320
+ "epoch": 1.04,
321
+ "grad_norm": 0.3386111259460449,
322
+ "learning_rate": 9.75939086369403e-05,
323
+ "loss": 0.9413,
324
+ "step": 370
325
+ },
326
+ {
327
+ "epoch": 1.07,
328
+ "grad_norm": 0.39352336525917053,
329
+ "learning_rate": 9.745564816713034e-05,
330
+ "loss": 0.9512,
331
+ "step": 380
332
+ },
333
+ {
334
+ "epoch": 1.1,
335
+ "grad_norm": 0.3844590187072754,
336
+ "learning_rate": 9.731362901710783e-05,
337
+ "loss": 0.935,
338
+ "step": 390
339
+ },
340
+ {
341
+ "epoch": 1.13,
342
+ "grad_norm": 0.4934450387954712,
343
+ "learning_rate": 9.71678624353667e-05,
344
+ "loss": 0.9239,
345
+ "step": 400
346
+ },
347
+ {
348
+ "epoch": 1.13,
349
+ "eval_loss": 0.9983939528465271,
350
+ "eval_runtime": 122.3109,
351
+ "eval_samples_per_second": 12.264,
352
+ "eval_steps_per_second": 0.515,
353
+ "step": 400
354
+ },
355
+ {
356
+ "epoch": 1.15,
357
+ "grad_norm": 0.4922298192977905,
358
+ "learning_rate": 9.701835996721265e-05,
359
+ "loss": 0.9265,
360
+ "step": 410
361
+ },
362
+ {
363
+ "epoch": 1.18,
364
+ "grad_norm": 0.5618098974227905,
365
+ "learning_rate": 9.686513345384882e-05,
366
+ "loss": 0.9361,
367
+ "step": 420
368
+ },
369
+ {
370
+ "epoch": 1.21,
371
+ "grad_norm": 0.5378989577293396,
372
+ "learning_rate": 9.67081950314378e-05,
373
+ "loss": 0.9043,
374
+ "step": 430
375
+ },
376
+ {
377
+ "epoch": 1.24,
378
+ "grad_norm": 0.5351877808570862,
379
+ "learning_rate": 9.654755713014052e-05,
380
+ "loss": 0.9006,
381
+ "step": 440
382
+ },
383
+ {
384
+ "epoch": 1.27,
385
+ "grad_norm": 0.6081934571266174,
386
+ "learning_rate": 9.638323247313166e-05,
387
+ "loss": 0.9456,
388
+ "step": 450
389
+ },
390
+ {
391
+ "epoch": 1.27,
392
+ "eval_loss": 0.9970971345901489,
393
+ "eval_runtime": 122.1402,
394
+ "eval_samples_per_second": 12.281,
395
+ "eval_steps_per_second": 0.516,
396
+ "step": 450
397
+ },
398
+ {
399
+ "epoch": 1.3,
400
+ "grad_norm": 0.5924888253211975,
401
+ "learning_rate": 9.621523407559193e-05,
402
+ "loss": 0.8814,
403
+ "step": 460
404
+ },
405
+ {
406
+ "epoch": 1.32,
407
+ "grad_norm": 0.5622063279151917,
408
+ "learning_rate": 9.604357524367722e-05,
409
+ "loss": 0.9459,
410
+ "step": 470
411
+ },
412
+ {
413
+ "epoch": 1.35,
414
+ "grad_norm": 0.6370586156845093,
415
+ "learning_rate": 9.586826957346473e-05,
416
+ "loss": 0.8911,
417
+ "step": 480
418
+ },
419
+ {
420
+ "epoch": 1.38,
421
+ "grad_norm": 0.5782639384269714,
422
+ "learning_rate": 9.568933094987601e-05,
423
+ "loss": 0.9142,
424
+ "step": 490
425
+ },
426
+ {
427
+ "epoch": 1.41,
428
+ "grad_norm": 0.5522053837776184,
429
+ "learning_rate": 9.550677354557734e-05,
430
+ "loss": 0.9064,
431
+ "step": 500
432
+ },
433
+ {
434
+ "epoch": 1.41,
435
+ "eval_loss": 0.9910521507263184,
436
+ "eval_runtime": 122.2738,
437
+ "eval_samples_per_second": 12.268,
438
+ "eval_steps_per_second": 0.515,
439
+ "step": 500
440
+ },
441
+ {
442
+ "epoch": 1.44,
443
+ "grad_norm": 0.5490506887435913,
444
+ "learning_rate": 9.532061181985714e-05,
445
+ "loss": 0.9389,
446
+ "step": 510
447
+ },
448
+ {
449
+ "epoch": 1.46,
450
+ "grad_norm": 0.6112892627716064,
451
+ "learning_rate": 9.513086051748068e-05,
452
+ "loss": 0.8996,
453
+ "step": 520
454
+ },
455
+ {
456
+ "epoch": 1.49,
457
+ "grad_norm": 0.6104739308357239,
458
+ "learning_rate": 9.493753466752235e-05,
459
+ "loss": 0.929,
460
+ "step": 530
461
+ },
462
+ {
463
+ "epoch": 1.52,
464
+ "grad_norm": 0.5805090665817261,
465
+ "learning_rate": 9.474064958217523e-05,
466
+ "loss": 0.9307,
467
+ "step": 540
468
+ },
469
+ {
470
+ "epoch": 1.55,
471
+ "grad_norm": 0.5656021237373352,
472
+ "learning_rate": 9.454022085553828e-05,
473
+ "loss": 0.9238,
474
+ "step": 550
475
+ },
476
+ {
477
+ "epoch": 1.55,
478
+ "eval_loss": 0.9884344935417175,
479
+ "eval_runtime": 122.2227,
480
+ "eval_samples_per_second": 12.273,
481
+ "eval_steps_per_second": 0.515,
482
+ "step": 550
483
+ },
484
+ {
485
+ "epoch": 1.58,
486
+ "grad_norm": 0.6901003122329712,
487
+ "learning_rate": 9.433626436238128e-05,
488
+ "loss": 0.9161,
489
+ "step": 560
490
+ },
491
+ {
492
+ "epoch": 1.61,
493
+ "grad_norm": 0.5691778063774109,
494
+ "learning_rate": 9.412879625688742e-05,
495
+ "loss": 0.8969,
496
+ "step": 570
497
+ },
498
+ {
499
+ "epoch": 1.63,
500
+ "grad_norm": 0.5715755820274353,
501
+ "learning_rate": 9.391783297137391e-05,
502
+ "loss": 0.9393,
503
+ "step": 580
504
+ },
505
+ {
506
+ "epoch": 1.66,
507
+ "grad_norm": 0.616493284702301,
508
+ "learning_rate": 9.370339121499038e-05,
509
+ "loss": 0.9139,
510
+ "step": 590
511
+ },
512
+ {
513
+ "epoch": 1.69,
514
+ "grad_norm": 0.6331434845924377,
515
+ "learning_rate": 9.348548797239551e-05,
516
+ "loss": 0.9053,
517
+ "step": 600
518
+ },
519
+ {
520
+ "epoch": 1.69,
521
+ "eval_loss": 0.9863258600234985,
522
+ "eval_runtime": 122.2709,
523
+ "eval_samples_per_second": 12.268,
524
+ "eval_steps_per_second": 0.515,
525
+ "step": 600
526
+ },
527
+ {
528
+ "epoch": 1.72,
529
+ "grad_norm": 0.5978617668151855,
530
+ "learning_rate": 9.326414050241175e-05,
531
+ "loss": 0.9176,
532
+ "step": 610
533
+ },
534
+ {
535
+ "epoch": 1.75,
536
+ "grad_norm": 0.6102743744850159,
537
+ "learning_rate": 9.30393663366584e-05,
538
+ "loss": 0.9378,
539
+ "step": 620
540
+ },
541
+ {
542
+ "epoch": 1.77,
543
+ "grad_norm": 0.6495214104652405,
544
+ "learning_rate": 9.28111832781629e-05,
545
+ "loss": 0.9158,
546
+ "step": 630
547
+ },
548
+ {
549
+ "epoch": 1.8,
550
+ "grad_norm": 0.6791807413101196,
551
+ "learning_rate": 9.257960939995093e-05,
552
+ "loss": 0.9385,
553
+ "step": 640
554
+ },
555
+ {
556
+ "epoch": 1.83,
557
+ "grad_norm": 0.6373294591903687,
558
+ "learning_rate": 9.234466304361487e-05,
559
+ "loss": 0.9066,
560
+ "step": 650
561
+ },
562
+ {
563
+ "epoch": 1.83,
564
+ "eval_loss": 0.9829334616661072,
565
+ "eval_runtime": 122.4062,
566
+ "eval_samples_per_second": 12.254,
567
+ "eval_steps_per_second": 0.515,
568
+ "step": 650
569
+ },
570
+ {
571
+ "epoch": 1.86,
572
+ "grad_norm": 0.6747245788574219,
573
+ "learning_rate": 9.2106362817861e-05,
574
+ "loss": 0.9185,
575
+ "step": 660
576
+ },
577
+ {
578
+ "epoch": 1.89,
579
+ "grad_norm": 0.6707281470298767,
580
+ "learning_rate": 9.186472759703579e-05,
581
+ "loss": 0.9425,
582
+ "step": 670
583
+ },
584
+ {
585
+ "epoch": 1.92,
586
+ "grad_norm": 0.6029950976371765,
587
+ "learning_rate": 9.161977651963081e-05,
588
+ "loss": 0.9143,
589
+ "step": 680
590
+ },
591
+ {
592
+ "epoch": 1.94,
593
+ "grad_norm": 0.6832878589630127,
594
+ "learning_rate": 9.137152898676698e-05,
595
+ "loss": 0.8889,
596
+ "step": 690
597
+ },
598
+ {
599
+ "epoch": 1.97,
600
+ "grad_norm": 0.6552910804748535,
601
+ "learning_rate": 9.112000466065784e-05,
602
+ "loss": 0.932,
603
+ "step": 700
604
+ },
605
+ {
606
+ "epoch": 1.97,
607
+ "eval_loss": 0.9802341461181641,
608
+ "eval_runtime": 122.2282,
609
+ "eval_samples_per_second": 12.272,
610
+ "eval_steps_per_second": 0.515,
611
+ "step": 700
612
+ },
613
+ {
614
+ "epoch": 2.0,
615
+ "grad_norm": 0.5472111701965332,
616
+ "learning_rate": 9.086522346305233e-05,
617
+ "loss": 0.9121,
618
+ "step": 710
619
+ },
620
+ {
621
+ "epoch": 2.03,
622
+ "grad_norm": 0.8331531286239624,
623
+ "learning_rate": 9.060720557365683e-05,
624
+ "loss": 0.7502,
625
+ "step": 720
626
+ },
627
+ {
628
+ "epoch": 2.06,
629
+ "grad_norm": 0.7888972163200378,
630
+ "learning_rate": 9.034597142853684e-05,
631
+ "loss": 0.7308,
632
+ "step": 730
633
+ },
634
+ {
635
+ "epoch": 2.08,
636
+ "grad_norm": 1.116204023361206,
637
+ "learning_rate": 9.008154171849844e-05,
638
+ "loss": 0.7424,
639
+ "step": 740
640
+ },
641
+ {
642
+ "epoch": 2.11,
643
+ "grad_norm": 1.0302730798721313,
644
+ "learning_rate": 8.981393738744939e-05,
645
+ "loss": 0.7339,
646
+ "step": 750
647
+ },
648
+ {
649
+ "epoch": 2.11,
650
+ "eval_loss": 1.0370149612426758,
651
+ "eval_runtime": 122.3036,
652
+ "eval_samples_per_second": 12.265,
653
+ "eval_steps_per_second": 0.515,
654
+ "step": 750
655
+ },
656
+ {
657
+ "epoch": 2.14,
658
+ "grad_norm": 0.8039250373840332,
659
+ "learning_rate": 8.954317963074034e-05,
660
+ "loss": 0.704,
661
+ "step": 760
662
+ },
663
+ {
664
+ "epoch": 2.17,
665
+ "grad_norm": 0.9725390076637268,
666
+ "learning_rate": 8.926928989348612e-05,
667
+ "loss": 0.7179,
668
+ "step": 770
669
+ },
670
+ {
671
+ "epoch": 2.2,
672
+ "grad_norm": 1.0024648904800415,
673
+ "learning_rate": 8.899228986886709e-05,
674
+ "loss": 0.7156,
675
+ "step": 780
676
+ },
677
+ {
678
+ "epoch": 2.23,
679
+ "grad_norm": 1.0251855850219727,
680
+ "learning_rate": 8.871220149641101e-05,
681
+ "loss": 0.7577,
682
+ "step": 790
683
+ },
684
+ {
685
+ "epoch": 2.25,
686
+ "grad_norm": 1.0738469362258911,
687
+ "learning_rate": 8.842904696025542e-05,
688
+ "loss": 0.7678,
689
+ "step": 800
690
+ },
691
+ {
692
+ "epoch": 2.25,
693
+ "eval_loss": 1.032098650932312,
694
+ "eval_runtime": 122.3453,
695
+ "eval_samples_per_second": 12.26,
696
+ "eval_steps_per_second": 0.515,
697
+ "step": 800
698
+ },
699
+ {
700
+ "epoch": 2.28,
701
+ "grad_norm": 0.9893040060997009,
702
+ "learning_rate": 8.814284868739038e-05,
703
+ "loss": 0.7659,
704
+ "step": 810
705
+ },
706
+ {
707
+ "epoch": 2.31,
708
+ "grad_norm": 1.0190677642822266,
709
+ "learning_rate": 8.785362934588233e-05,
710
+ "loss": 0.7443,
711
+ "step": 820
712
+ },
713
+ {
714
+ "epoch": 2.34,
715
+ "grad_norm": 0.9342036247253418,
716
+ "learning_rate": 8.756141184307864e-05,
717
+ "loss": 0.7736,
718
+ "step": 830
719
+ },
720
+ {
721
+ "epoch": 2.37,
722
+ "grad_norm": 0.9595410823822021,
723
+ "learning_rate": 8.726621932379319e-05,
724
+ "loss": 0.7527,
725
+ "step": 840
726
+ },
727
+ {
728
+ "epoch": 2.39,
729
+ "grad_norm": 1.057177186012268,
730
+ "learning_rate": 8.696807516847328e-05,
731
+ "loss": 0.7538,
732
+ "step": 850
733
+ },
734
+ {
735
+ "epoch": 2.39,
736
+ "eval_loss": 1.0239675045013428,
737
+ "eval_runtime": 122.2319,
738
+ "eval_samples_per_second": 12.272,
739
+ "eval_steps_per_second": 0.515,
740
+ "step": 850
741
+ },
742
+ {
743
+ "epoch": 2.39,
744
+ "step": 850,
745
+ "total_flos": 2.7031368943670395e+18,
746
+ "train_loss": 0.943028081445133,
747
+ "train_runtime": 7174.3764,
748
+ "train_samples_per_second": 11.848,
749
+ "train_steps_per_second": 0.495
750
+ }
751
+ ],
752
+ "logging_steps": 10,
753
+ "max_steps": 3550,
754
+ "num_input_tokens_seen": 0,
755
+ "num_train_epochs": 10,
756
+ "save_steps": 500,
757
+ "total_flos": 2.7031368943670395e+18,
758
+ "train_batch_size": 8,
759
+ "trial_name": null,
760
+ "trial_params": null
761
+ }
llama2_13b_peft/cnn_dailymail/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98b719af2fd3d7324911d57e3cefb5f31c9c0c5b7c26baed7a0a99cadff76d55
3
+ size 5112
llama2_13b_peft/cnn_dailymail/training_eval_loss.png ADDED
llama2_13b_peft/cnn_dailymail/training_loss.png ADDED
llama2_13b_peft/contextual_parametric_knowledge_conflicts/README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: peft
4
+ tags:
5
+ - llama-factory
6
+ - lora
7
+ - generated_from_trainer
8
+ base_model: /data1/model/llama2/meta-llama/Llama2-13b
9
+ model-index:
10
+ - name: contextual_parametric_knowledge_conflicts_no_sys
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # contextual_parametric_knowledge_conflicts_no_sys
18
+
19
+ This model is a fine-tuned version of [/data1/model/llama2/meta-llama/Llama2-13b](https://huggingface.co//data1/model/llama2/meta-llama/Llama2-13b) on the contextual_parametric_knowledge_conflicts_no_sys dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.0000
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 0.0001
41
+ - train_batch_size: 8
42
+ - eval_batch_size: 8
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - num_devices: 4
46
+ - total_train_batch_size: 32
47
+ - total_eval_batch_size: 32
48
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
+ - lr_scheduler_type: cosine
50
+ - lr_scheduler_warmup_steps: 20
51
+ - num_epochs: 10.0
52
+
53
+ ### Training results
54
+
55
+ | Training Loss | Epoch | Step | Validation Loss |
56
+ |:-------------:|:-----:|:----:|:---------------:|
57
+ | 0.0098 | 0.13 | 50 | 0.0022 |
58
+ | 0.0023 | 0.27 | 100 | 0.0005 |
59
+ | 0.0002 | 0.4 | 150 | 0.0001 |
60
+ | 0.0 | 0.54 | 200 | 0.0000 |
61
+ | 0.0 | 0.67 | 250 | 0.0000 |
62
+ | 0.0006 | 0.8 | 300 | 0.0001 |
63
+ | 0.0 | 0.94 | 350 | 0.0000 |
64
+ | 0.0 | 1.07 | 400 | 0.0001 |
65
+ | 0.0 | 1.21 | 450 | 0.0000 |
66
+ | 0.0 | 1.34 | 500 | 0.0000 |
67
+ | 0.0 | 1.47 | 550 | 0.0000 |
68
+ | 0.0 | 1.61 | 600 | 0.0000 |
69
+ | 0.0 | 1.74 | 650 | 0.0000 |
70
+
71
+
72
+ ### Framework versions
73
+
74
+ - PEFT 0.9.0
75
+ - Transformers 4.38.2
76
+ - Pytorch 2.2.1
77
+ - Datasets 2.18.0
78
+ - Tokenizers 0.15.2
llama2_13b_peft/contextual_parametric_knowledge_conflicts/adapter_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/data1/model/llama2/meta-llama/Llama2-13b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.0,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 8,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "up_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "gate_proj",
26
+ "o_proj",
27
+ "v_proj",
28
+ "k_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "use_dora": false,
32
+ "use_rslora": false
33
+ }
llama2_13b_peft/contextual_parametric_knowledge_conflicts/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ff9f6f4864adea593d264b78cd225dd6a9a69e45ec9624910470fe96c5b9aa0
3
+ size 125248064
llama2_13b_peft/contextual_parametric_knowledge_conflicts/all_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.74,
3
+ "eval_loss": 4.564080882119015e-05,
4
+ "eval_runtime": 36.4056,
5
+ "eval_samples_per_second": 57.793,
6
+ "eval_steps_per_second": 1.813,
7
+ "train_loss": 0.0704553289199248,
8
+ "train_runtime": 1752.1107,
9
+ "train_samples_per_second": 68.027,
10
+ "train_steps_per_second": 2.129
11
+ }
llama2_13b_peft/contextual_parametric_knowledge_conflicts/eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.74,
3
+ "eval_loss": 4.564080882119015e-05,
4
+ "eval_runtime": 36.4056,
5
+ "eval_samples_per_second": 57.793,
6
+ "eval_steps_per_second": 1.813
7
+ }
llama2_13b_peft/contextual_parametric_knowledge_conflicts/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
llama2_13b_peft/contextual_parametric_knowledge_conflicts/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
llama2_13b_peft/contextual_parametric_knowledge_conflicts/tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "LlamaTokenizer",
43
+ "unk_token": "<unk>",
44
+ "use_default_system_prompt": false
45
+ }