ParagonLight committed on
Commit
c6dd428
1 Parent(s): 33f3da9

update 28 tasks lora adapters

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. llama2_13b_peft/linguistics_puzzles/README.md +74 -0
  2. llama2_13b_peft/linguistics_puzzles/adapter_config.json +34 -0
  3. llama2_13b_peft/linguistics_puzzles/adapter_model.safetensors +3 -0
  4. llama2_13b_peft/linguistics_puzzles/all_results.json +12 -0
  5. llama2_13b_peft/linguistics_puzzles/eval_results.json +7 -0
  6. llama2_13b_peft/linguistics_puzzles/special_tokens_map.json +24 -0
  7. llama2_13b_peft/linguistics_puzzles/tokenizer.model +3 -0
  8. llama2_13b_peft/linguistics_puzzles/tokenizer_config.json +45 -0
  9. llama2_13b_peft/linguistics_puzzles/train_results.json +8 -0
  10. llama2_13b_peft/linguistics_puzzles/trainer_log.jsonl +106 -0
  11. llama2_13b_peft/linguistics_puzzles/trainer_state.json +767 -0
  12. llama2_13b_peft/linguistics_puzzles/training_args.bin +3 -0
  13. llama2_13b_peft/linguistics_puzzles/training_eval_loss.png +0 -0
  14. llama2_13b_peft/linguistics_puzzles/training_loss.png +0 -0
  15. llama2_13b_peft/news_commentary_de/README.md +85 -0
  16. llama2_13b_peft/news_commentary_de/adapter_config.json +33 -0
  17. llama2_13b_peft/news_commentary_de/adapter_model.safetensors +3 -0
  18. llama2_13b_peft/news_commentary_de/all_results.json +11 -0
  19. llama2_13b_peft/news_commentary_de/eval_results.json +7 -0
  20. llama2_13b_peft/news_commentary_de/special_tokens_map.json +24 -0
  21. llama2_13b_peft/news_commentary_de/tokenizer.model +3 -0
  22. llama2_13b_peft/news_commentary_de/tokenizer_config.json +45 -0
  23. llama2_13b_peft/news_commentary_de/train_results.json +7 -0
  24. llama2_13b_peft/news_commentary_de/trainer_log.jsonl +0 -0
  25. llama2_13b_peft/news_commentary_de/trainer_state.json +2990 -0
  26. llama2_13b_peft/news_commentary_de/training_args.bin +3 -0
  27. llama2_13b_peft/news_commentary_de/training_eval_loss.png +0 -0
  28. llama2_13b_peft/news_commentary_de/training_loss.png +0 -0
  29. llama2_13b_peft/news_commentary_it/README.md +67 -0
  30. llama2_13b_peft/news_commentary_it/adapter_config.json +34 -0
  31. llama2_13b_peft/news_commentary_it/adapter_model.safetensors +3 -0
  32. llama2_13b_peft/news_commentary_it/all_results.json +12 -0
  33. llama2_13b_peft/news_commentary_it/eval_results.json +7 -0
  34. llama2_13b_peft/news_commentary_it/special_tokens_map.json +24 -0
  35. llama2_13b_peft/news_commentary_it/tokenizer.model +3 -0
  36. llama2_13b_peft/news_commentary_it/tokenizer_config.json +45 -0
  37. llama2_13b_peft/news_commentary_it/train_results.json +8 -0
  38. llama2_13b_peft/news_commentary_it/trainer_log.jsonl +0 -0
  39. llama2_13b_peft/news_commentary_it/trainer_state.json +0 -0
  40. llama2_13b_peft/news_commentary_it/training_args.bin +3 -0
  41. llama2_13b_peft/news_commentary_it/training_eval_loss.png +0 -0
  42. llama2_13b_peft/news_commentary_it/training_loss.png +0 -0
  43. llama2_13b_peft/topical_chat/README.md +89 -0
  44. llama2_13b_peft/topical_chat/adapter_config.json +34 -0
  45. llama2_13b_peft/topical_chat/adapter_model.safetensors +3 -0
  46. llama2_13b_peft/topical_chat/all_results.json +12 -0
  47. llama2_13b_peft/topical_chat/eval_results.json +7 -0
  48. llama2_13b_peft/topical_chat/special_tokens_map.json +24 -0
  49. llama2_13b_peft/topical_chat/tokenizer.model +3 -0
  50. llama2_13b_peft/topical_chat/tokenizer_config.json +45 -0
llama2_13b_peft/linguistics_puzzles/README.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: peft
4
+ tags:
5
+ - llama-factory
6
+ - lora
7
+ - generated_from_trainer
8
+ base_model: /data1/model/llama2/meta-llama/Llama2-13b
9
+ model-index:
10
+ - name: linguistics_puzzles_no_sys
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # linguistics_puzzles_no_sys
18
+
19
+ This model is a fine-tuned version of [/data1/model/llama2/meta-llama/Llama2-13b](https://huggingface.co//data1/model/llama2/meta-llama/Llama2-13b) on the linguistics_puzzles_no_sys dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.5924
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 5e-05
41
+ - train_batch_size: 4
42
+ - eval_batch_size: 4
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - num_devices: 2
46
+ - total_train_batch_size: 8
47
+ - total_eval_batch_size: 8
48
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
+ - lr_scheduler_type: cosine
50
+ - lr_scheduler_warmup_steps: 20
51
+ - num_epochs: 5.0
52
+
53
+ ### Training results
54
+
55
+ | Training Loss | Epoch | Step | Validation Loss |
56
+ |:-------------:|:------:|:----:|:---------------:|
57
+ | 1.1276 | 0.5263 | 100 | 1.0876 |
58
+ | 0.8128 | 1.0526 | 200 | 0.8153 |
59
+ | 0.6705 | 1.5789 | 300 | 0.6892 |
60
+ | 0.4876 | 2.1053 | 400 | 0.6225 |
61
+ | 0.4435 | 2.6316 | 500 | 0.5924 |
62
+ | 0.2743 | 3.1579 | 600 | 0.6151 |
63
+ | 0.2846 | 3.6842 | 700 | 0.6084 |
64
+ | 0.2069 | 4.2105 | 800 | 0.6427 |
65
+ | 0.172 | 4.7368 | 900 | 0.6495 |
66
+
67
+
68
+ ### Framework versions
69
+
70
+ - PEFT 0.10.0
71
+ - Transformers 4.40.0
72
+ - Pytorch 2.2.1
73
+ - Datasets 2.18.0
74
+ - Tokenizers 0.19.1
llama2_13b_peft/linguistics_puzzles/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/data1/model/llama2/meta-llama/Llama2-13b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "k_proj",
25
+ "q_proj",
26
+ "up_proj",
27
+ "gate_proj",
28
+ "v_proj",
29
+ "down_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
llama2_13b_peft/linguistics_puzzles/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef6dff2a000b0e4ef6a4db9d169cbaa257c1423e45fae8ef4f428ba9852e00f5
3
+ size 125248064
llama2_13b_peft/linguistics_puzzles/all_results.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "eval_loss": 0.5924356579780579,
4
+ "eval_runtime": 1.9025,
5
+ "eval_samples_per_second": 42.05,
6
+ "eval_steps_per_second": 5.256,
7
+ "total_flos": 2.0275085174217114e+17,
8
+ "train_loss": 0.5822552880487945,
9
+ "train_runtime": 660.0352,
10
+ "train_samples_per_second": 11.515,
11
+ "train_steps_per_second": 1.439
12
+ }
llama2_13b_peft/linguistics_puzzles/eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "eval_loss": 0.5924356579780579,
4
+ "eval_runtime": 1.9025,
5
+ "eval_samples_per_second": 42.05,
6
+ "eval_steps_per_second": 5.256
7
+ }
llama2_13b_peft/linguistics_puzzles/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
llama2_13b_peft/linguistics_puzzles/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
llama2_13b_peft/linguistics_puzzles/tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "LlamaTokenizer",
43
+ "unk_token": "<unk>",
44
+ "use_default_system_prompt": false
45
+ }
llama2_13b_peft/linguistics_puzzles/train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "total_flos": 2.0275085174217114e+17,
4
+ "train_loss": 0.5822552880487945,
5
+ "train_runtime": 660.0352,
6
+ "train_samples_per_second": 11.515,
7
+ "train_steps_per_second": 1.439
8
+ }
llama2_13b_peft/linguistics_puzzles/trainer_log.jsonl ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 950, "loss": 2.5922, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.5e-05, "epoch": 0.05263157894736842, "percentage": 1.05, "elapsed_time": "0:00:07", "remaining_time": "0:12:31"}
2
+ {"current_steps": 20, "total_steps": 950, "loss": 2.3206, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 5e-05, "epoch": 0.10526315789473684, "percentage": 2.11, "elapsed_time": "0:00:14", "remaining_time": "0:11:00"}
3
+ {"current_steps": 30, "total_steps": 950, "loss": 1.7229, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.998573727324295e-05, "epoch": 0.15789473684210525, "percentage": 3.16, "elapsed_time": "0:00:20", "remaining_time": "0:10:32"}
4
+ {"current_steps": 40, "total_steps": 950, "loss": 1.3729, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.994296536700177e-05, "epoch": 0.21052631578947367, "percentage": 4.21, "elapsed_time": "0:00:26", "remaining_time": "0:10:14"}
5
+ {"current_steps": 50, "total_steps": 950, "loss": 1.3635, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.987173308479738e-05, "epoch": 0.2631578947368421, "percentage": 5.26, "elapsed_time": "0:00:33", "remaining_time": "0:10:10"}
6
+ {"current_steps": 60, "total_steps": 950, "loss": 1.3315, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.977212170395598e-05, "epoch": 0.3157894736842105, "percentage": 6.32, "elapsed_time": "0:00:40", "remaining_time": "0:10:00"}
7
+ {"current_steps": 70, "total_steps": 950, "loss": 1.2515, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.964424488287009e-05, "epoch": 0.3684210526315789, "percentage": 7.37, "elapsed_time": "0:00:46", "remaining_time": "0:09:49"}
8
+ {"current_steps": 80, "total_steps": 950, "loss": 1.1872, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.948824853131236e-05, "epoch": 0.42105263157894735, "percentage": 8.42, "elapsed_time": "0:00:53", "remaining_time": "0:09:43"}
9
+ {"current_steps": 90, "total_steps": 950, "loss": 1.1552, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.930431064394977e-05, "epoch": 0.47368421052631576, "percentage": 9.47, "elapsed_time": "0:01:00", "remaining_time": "0:09:34"}
10
+ {"current_steps": 100, "total_steps": 950, "loss": 1.1276, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.909264109724853e-05, "epoch": 0.5263157894736842, "percentage": 10.53, "elapsed_time": "0:01:06", "remaining_time": "0:09:27"}
11
+ {"current_steps": 100, "total_steps": 950, "loss": null, "eval_loss": 1.0876480340957642, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 0.5263157894736842, "percentage": 10.53, "elapsed_time": "0:01:06", "remaining_time": "0:09:27"}
12
+ {"current_steps": 110, "total_steps": 950, "loss": 1.1756, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.885348141000122e-05, "epoch": 0.5789473684210527, "percentage": 11.58, "elapsed_time": "0:01:15", "remaining_time": "0:09:37"}
13
+ {"current_steps": 120, "total_steps": 950, "loss": 1.1106, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.858710446774951e-05, "epoch": 0.631578947368421, "percentage": 12.63, "elapsed_time": "0:01:22", "remaining_time": "0:09:29"}
14
+ {"current_steps": 130, "total_steps": 950, "loss": 1.0175, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.829381421141671e-05, "epoch": 0.6842105263157895, "percentage": 13.68, "elapsed_time": "0:01:28", "remaining_time": "0:09:19"}
15
+ {"current_steps": 140, "total_steps": 950, "loss": 0.9733, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.7973945290505766e-05, "epoch": 0.7368421052631579, "percentage": 14.74, "elapsed_time": "0:01:35", "remaining_time": "0:09:10"}
16
+ {"current_steps": 150, "total_steps": 950, "loss": 0.9907, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.7627862681258037e-05, "epoch": 0.7894736842105263, "percentage": 15.79, "elapsed_time": "0:01:41", "remaining_time": "0:09:03"}
17
+ {"current_steps": 160, "total_steps": 950, "loss": 0.9312, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.725596127020879e-05, "epoch": 0.8421052631578947, "percentage": 16.84, "elapsed_time": "0:01:48", "remaining_time": "0:08:54"}
18
+ {"current_steps": 170, "total_steps": 950, "loss": 0.9586, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.685866540361456e-05, "epoch": 0.8947368421052632, "percentage": 17.89, "elapsed_time": "0:01:55", "remaining_time": "0:08:47"}
19
+ {"current_steps": 180, "total_steps": 950, "loss": 0.9595, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.643642840326627e-05, "epoch": 0.9473684210526315, "percentage": 18.95, "elapsed_time": "0:02:01", "remaining_time": "0:08:39"}
20
+ {"current_steps": 190, "total_steps": 950, "loss": 0.8331, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.598973204924097e-05, "epoch": 1.0, "percentage": 20.0, "elapsed_time": "0:02:08", "remaining_time": "0:08:32"}
21
+ {"current_steps": 200, "total_steps": 950, "loss": 0.8128, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.551908603018191e-05, "epoch": 1.0526315789473684, "percentage": 21.05, "elapsed_time": "0:02:14", "remaining_time": "0:08:25"}
22
+ {"current_steps": 200, "total_steps": 950, "loss": null, "eval_loss": 0.8153461217880249, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 1.0526315789473684, "percentage": 21.05, "elapsed_time": "0:02:14", "remaining_time": "0:08:25"}
23
+ {"current_steps": 210, "total_steps": 950, "loss": 0.8186, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.502502736173462e-05, "epoch": 1.1052631578947367, "percentage": 22.11, "elapsed_time": "0:02:23", "remaining_time": "0:08:26"}
24
+ {"current_steps": 220, "total_steps": 950, "loss": 0.6895, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.45081197738023e-05, "epoch": 1.1578947368421053, "percentage": 23.16, "elapsed_time": "0:02:30", "remaining_time": "0:08:18"}
25
+ {"current_steps": 230, "total_steps": 950, "loss": 0.7901, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.3968953067319777e-05, "epoch": 1.2105263157894737, "percentage": 24.21, "elapsed_time": "0:02:36", "remaining_time": "0:08:10"}
26
+ {"current_steps": 240, "total_steps": 950, "loss": 0.704, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.340814244127993e-05, "epoch": 1.263157894736842, "percentage": 25.26, "elapsed_time": "0:02:43", "remaining_time": "0:08:03"}
27
+ {"current_steps": 250, "total_steps": 950, "loss": 0.6879, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.282632779078051e-05, "epoch": 1.3157894736842106, "percentage": 26.32, "elapsed_time": "0:02:49", "remaining_time": "0:07:55"}
28
+ {"current_steps": 260, "total_steps": 950, "loss": 0.7563, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.222417297689217e-05, "epoch": 1.368421052631579, "percentage": 27.37, "elapsed_time": "0:02:56", "remaining_time": "0:07:48"}
29
+ {"current_steps": 270, "total_steps": 950, "loss": 0.6846, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.160236506918098e-05, "epoch": 1.4210526315789473, "percentage": 28.42, "elapsed_time": "0:03:03", "remaining_time": "0:07:41"}
30
+ {"current_steps": 280, "total_steps": 950, "loss": 0.7155, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.096161356174959e-05, "epoch": 1.4736842105263157, "percentage": 29.47, "elapsed_time": "0:03:09", "remaining_time": "0:07:34"}
31
+ {"current_steps": 290, "total_steps": 950, "loss": 0.8037, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.030264956369157e-05, "epoch": 1.526315789473684, "percentage": 30.53, "elapsed_time": "0:03:16", "remaining_time": "0:07:26"}
32
+ {"current_steps": 300, "total_steps": 950, "loss": 0.6705, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.962622496488269e-05, "epoch": 1.5789473684210527, "percentage": 31.58, "elapsed_time": "0:03:22", "remaining_time": "0:07:19"}
33
+ {"current_steps": 300, "total_steps": 950, "loss": null, "eval_loss": 0.6891714930534363, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 1.5789473684210527, "percentage": 31.58, "elapsed_time": "0:03:22", "remaining_time": "0:07:19"}
34
+ {"current_steps": 310, "total_steps": 950, "loss": 0.6389, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.893311157806091e-05, "epoch": 1.631578947368421, "percentage": 32.63, "elapsed_time": "0:03:31", "remaining_time": "0:07:17"}
35
+ {"current_steps": 320, "total_steps": 950, "loss": 0.7223, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.822410025817406e-05, "epoch": 1.6842105263157894, "percentage": 33.68, "elapsed_time": "0:03:38", "remaining_time": "0:07:10"}
36
+ {"current_steps": 330, "total_steps": 950, "loss": 0.6948, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.7500000000000003e-05, "epoch": 1.736842105263158, "percentage": 34.74, "elapsed_time": "0:03:45", "remaining_time": "0:07:03"}
37
+ {"current_steps": 340, "total_steps": 950, "loss": 0.6658, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.67616370150689e-05, "epoch": 1.7894736842105263, "percentage": 35.79, "elapsed_time": "0:03:51", "remaining_time": "0:06:55"}
38
+ {"current_steps": 350, "total_steps": 950, "loss": 0.643, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.600985378894086e-05, "epoch": 1.8421052631578947, "percentage": 36.84, "elapsed_time": "0:03:58", "remaining_time": "0:06:48"}
39
+ {"current_steps": 360, "total_steps": 950, "loss": 0.6537, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.5245508119914687e-05, "epoch": 1.8947368421052633, "percentage": 37.89, "elapsed_time": "0:04:05", "remaining_time": "0:06:41"}
40
+ {"current_steps": 370, "total_steps": 950, "loss": 0.641, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.44694721402644e-05, "epoch": 1.9473684210526314, "percentage": 38.95, "elapsed_time": "0:04:12", "remaining_time": "0:06:35"}
41
+ {"current_steps": 380, "total_steps": 950, "loss": 0.6708, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.3682631321120504e-05, "epoch": 2.0, "percentage": 40.0, "elapsed_time": "0:04:18", "remaining_time": "0:06:27"}
42
+ {"current_steps": 390, "total_steps": 950, "loss": 0.5061, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.2885883462131394e-05, "epoch": 2.0526315789473686, "percentage": 41.05, "elapsed_time": "0:04:24", "remaining_time": "0:06:20"}
43
+ {"current_steps": 400, "total_steps": 950, "loss": 0.4876, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.2080137667057595e-05, "epoch": 2.1052631578947367, "percentage": 42.11, "elapsed_time": "0:04:31", "remaining_time": "0:06:13"}
44
+ {"current_steps": 400, "total_steps": 950, "loss": null, "eval_loss": 0.6224929690361023, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 2.1052631578947367, "percentage": 42.11, "elapsed_time": "0:04:31", "remaining_time": "0:06:13"}
45
+ {"current_steps": 410, "total_steps": 950, "loss": 0.485, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.126631330646802e-05, "epoch": 2.1578947368421053, "percentage": 43.16, "elapsed_time": "0:04:40", "remaining_time": "0:06:09"}
46
+ {"current_steps": 420, "total_steps": 950, "loss": 0.536, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.0445338968721287e-05, "epoch": 2.2105263157894735, "percentage": 44.21, "elapsed_time": "0:04:47", "remaining_time": "0:06:02"}
47
+ {"current_steps": 430, "total_steps": 950, "loss": 0.4493, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.961815140042974e-05, "epoch": 2.263157894736842, "percentage": 45.26, "elapsed_time": "0:04:53", "remaining_time": "0:05:55"}
48
+ {"current_steps": 440, "total_steps": 950, "loss": 0.4806, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.878569443761442e-05, "epoch": 2.3157894736842106, "percentage": 46.32, "elapsed_time": "0:05:00", "remaining_time": "0:05:48"}
49
+ {"current_steps": 450, "total_steps": 950, "loss": 0.4642, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.7948917928771158e-05, "epoch": 2.3684210526315788, "percentage": 47.37, "elapsed_time": "0:05:07", "remaining_time": "0:05:41"}
50
+ {"current_steps": 460, "total_steps": 950, "loss": 0.4857, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.7108776651076118e-05, "epoch": 2.4210526315789473, "percentage": 48.42, "elapsed_time": "0:05:13", "remaining_time": "0:05:34"}
51
+ {"current_steps": 470, "total_steps": 950, "loss": 0.4604, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.6266229220967818e-05, "epoch": 2.473684210526316, "percentage": 49.47, "elapsed_time": "0:05:20", "remaining_time": "0:05:27"}
52
+ {"current_steps": 480, "total_steps": 950, "loss": 0.4294, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.5422237000348276e-05, "epoch": 2.526315789473684, "percentage": 50.53, "elapsed_time": "0:05:27", "remaining_time": "0:05:20"}
53
+ {"current_steps": 490, "total_steps": 950, "loss": 0.436, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.4577762999651726e-05, "epoch": 2.5789473684210527, "percentage": 51.58, "elapsed_time": "0:05:34", "remaining_time": "0:05:13"}
54
+ {"current_steps": 500, "total_steps": 950, "loss": 0.4435, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.3733770779032184e-05, "epoch": 2.6315789473684212, "percentage": 52.63, "elapsed_time": "0:05:40", "remaining_time": "0:05:06"}
55
+ {"current_steps": 500, "total_steps": 950, "loss": null, "eval_loss": 0.5924356579780579, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 2.6315789473684212, "percentage": 52.63, "elapsed_time": "0:05:40", "remaining_time": "0:05:06"}
56
+ {"current_steps": 510, "total_steps": 950, "loss": 0.4128, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.2891223348923884e-05, "epoch": 2.6842105263157894, "percentage": 53.68, "elapsed_time": "0:05:49", "remaining_time": "0:05:01"}
57
+ {"current_steps": 520, "total_steps": 950, "loss": 0.4201, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.2051082071228854e-05, "epoch": 2.736842105263158, "percentage": 54.74, "elapsed_time": "0:05:55", "remaining_time": "0:04:54"}
58
+ {"current_steps": 530, "total_steps": 950, "loss": 0.4144, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.1214305562385592e-05, "epoch": 2.7894736842105265, "percentage": 55.79, "elapsed_time": "0:06:02", "remaining_time": "0:04:47"}
59
+ {"current_steps": 540, "total_steps": 950, "loss": 0.4325, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.0381848599570276e-05, "epoch": 2.8421052631578947, "percentage": 56.84, "elapsed_time": "0:06:09", "remaining_time": "0:04:40"}
60
+ {"current_steps": 550, "total_steps": 950, "loss": 0.4539, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.9554661031278712e-05, "epoch": 2.8947368421052633, "percentage": 57.89, "elapsed_time": "0:06:15", "remaining_time": "0:04:33"}
61
+ {"current_steps": 560, "total_steps": 950, "loss": 0.3898, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.8733686693531985e-05, "epoch": 2.9473684210526314, "percentage": 58.95, "elapsed_time": "0:06:22", "remaining_time": "0:04:26"}
62
+ {"current_steps": 570, "total_steps": 950, "loss": 0.4347, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.79198623329424e-05, "epoch": 3.0, "percentage": 60.0, "elapsed_time": "0:06:28", "remaining_time": "0:04:19"}
63
+ {"current_steps": 580, "total_steps": 950, "loss": 0.2771, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.711411653786861e-05, "epoch": 3.0526315789473686, "percentage": 61.05, "elapsed_time": "0:06:35", "remaining_time": "0:04:12"}
64
+ {"current_steps": 590, "total_steps": 950, "loss": 0.2786, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.6317368678879495e-05, "epoch": 3.1052631578947367, "percentage": 62.11, "elapsed_time": "0:06:41", "remaining_time": "0:04:05"}
65
+ {"current_steps": 600, "total_steps": 950, "loss": 0.2743, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.55305278597356e-05, "epoch": 3.1578947368421053, "percentage": 63.16, "elapsed_time": "0:06:48", "remaining_time": "0:03:58"}
66
+ {"current_steps": 600, "total_steps": 950, "loss": null, "eval_loss": 0.6151354908943176, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 3.1578947368421053, "percentage": 63.16, "elapsed_time": "0:06:48", "remaining_time": "0:03:58"}
67
+ {"current_steps": 610, "total_steps": 950, "loss": 0.2611, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.475449188008532e-05, "epoch": 3.2105263157894735, "percentage": 64.21, "elapsed_time": "0:06:57", "remaining_time": "0:03:52"}
68
+ {"current_steps": 620, "total_steps": 950, "loss": 0.237, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.399014621105914e-05, "epoch": 3.263157894736842, "percentage": 65.26, "elapsed_time": "0:07:03", "remaining_time": "0:03:45"}
69
+ {"current_steps": 630, "total_steps": 950, "loss": 0.2319, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.3238362984931113e-05, "epoch": 3.3157894736842106, "percentage": 66.32, "elapsed_time": "0:07:10", "remaining_time": "0:03:38"}
70
+ {"current_steps": 640, "total_steps": 950, "loss": 0.2785, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.2500000000000006e-05, "epoch": 3.3684210526315788, "percentage": 67.37, "elapsed_time": "0:07:17", "remaining_time": "0:03:31"}
71
+ {"current_steps": 650, "total_steps": 950, "loss": 0.3323, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.1775899741825947e-05, "epoch": 3.4210526315789473, "percentage": 68.42, "elapsed_time": "0:07:23", "remaining_time": "0:03:24"}
72
+ {"current_steps": 660, "total_steps": 950, "loss": 0.2762, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.1066888421939093e-05, "epoch": 3.473684210526316, "percentage": 69.47, "elapsed_time": "0:07:30", "remaining_time": "0:03:17"}
73
+ {"current_steps": 670, "total_steps": 950, "loss": 0.2982, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.0373775035117305e-05, "epoch": 3.526315789473684, "percentage": 70.53, "elapsed_time": "0:07:37", "remaining_time": "0:03:11"}
74
+ {"current_steps": 680, "total_steps": 950, "loss": 0.2338, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 9.697350436308427e-06, "epoch": 3.5789473684210527, "percentage": 71.58, "elapsed_time": "0:07:43", "remaining_time": "0:03:04"}
75
+ {"current_steps": 690, "total_steps": 950, "loss": 0.2962, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 9.038386438250415e-06, "epoch": 3.6315789473684212, "percentage": 72.63, "elapsed_time": "0:07:50", "remaining_time": "0:02:57"}
76
+ {"current_steps": 700, "total_steps": 950, "loss": 0.2846, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 8.397634930819021e-06, "epoch": 3.6842105263157894, "percentage": 73.68, "elapsed_time": "0:07:57", "remaining_time": "0:02:50"}
77
+ {"current_steps": 700, "total_steps": 950, "loss": null, "eval_loss": 0.6083844900131226, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 3.6842105263157894, "percentage": 73.68, "elapsed_time": "0:07:57", "remaining_time": "0:02:50"}
78
+ {"current_steps": 710, "total_steps": 950, "loss": 0.2895, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 7.775827023107835e-06, "epoch": 3.736842105263158, "percentage": 74.74, "elapsed_time": "0:08:06", "remaining_time": "0:02:44"}
79
+ {"current_steps": 720, "total_steps": 950, "loss": 0.3261, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 7.173672209219495e-06, "epoch": 3.7894736842105265, "percentage": 75.79, "elapsed_time": "0:08:13", "remaining_time": "0:02:37"}
80
+ {"current_steps": 730, "total_steps": 950, "loss": 0.2358, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 6.591857558720071e-06, "epoch": 3.8421052631578947, "percentage": 76.84, "elapsed_time": "0:08:20", "remaining_time": "0:02:30"}
81
+ {"current_steps": 740, "total_steps": 950, "loss": 0.2723, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 6.031046932680229e-06, "epoch": 3.8947368421052633, "percentage": 77.89, "elapsed_time": "0:08:26", "remaining_time": "0:02:23"}
82
+ {"current_steps": 750, "total_steps": 950, "loss": 0.2941, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 5.491880226197707e-06, "epoch": 3.9473684210526314, "percentage": 78.95, "elapsed_time": "0:08:33", "remaining_time": "0:02:16"}
83
+ {"current_steps": 760, "total_steps": 950, "loss": 0.2721, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.9749726382653905e-06, "epoch": 4.0, "percentage": 80.0, "elapsed_time": "0:08:39", "remaining_time": "0:02:09"}
84
+ {"current_steps": 770, "total_steps": 950, "loss": 0.1677, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.480913969818098e-06, "epoch": 4.052631578947368, "percentage": 81.05, "elapsed_time": "0:08:46", "remaining_time": "0:02:03"}
85
+ {"current_steps": 780, "total_steps": 950, "loss": 0.2291, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.010267950759025e-06, "epoch": 4.105263157894737, "percentage": 82.11, "elapsed_time": "0:08:53", "remaining_time": "0:01:56"}
86
+ {"current_steps": 790, "total_steps": 950, "loss": 0.1991, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.5635715967337223e-06, "epoch": 4.157894736842105, "percentage": 83.16, "elapsed_time": "0:09:00", "remaining_time": "0:01:49"}
87
+ {"current_steps": 800, "total_steps": 950, "loss": 0.2069, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.141334596385448e-06, "epoch": 4.2105263157894735, "percentage": 84.21, "elapsed_time": "0:09:06", "remaining_time": "0:01:42"}
88
+ {"current_steps": 800, "total_steps": 950, "loss": null, "eval_loss": 0.6427180767059326, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 4.2105263157894735, "percentage": 84.21, "elapsed_time": "0:09:06", "remaining_time": "0:01:42"}
89
+ {"current_steps": 810, "total_steps": 950, "loss": 0.2213, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.7440387297912123e-06, "epoch": 4.2631578947368425, "percentage": 85.26, "elapsed_time": "0:09:15", "remaining_time": "0:01:36"}
90
+ {"current_steps": 820, "total_steps": 950, "loss": 0.2008, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.372137318741968e-06, "epoch": 4.315789473684211, "percentage": 86.32, "elapsed_time": "0:09:22", "remaining_time": "0:01:29"}
91
+ {"current_steps": 830, "total_steps": 950, "loss": 0.2178, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.026054709494235e-06, "epoch": 4.368421052631579, "percentage": 87.37, "elapsed_time": "0:09:28", "remaining_time": "0:01:22"}
92
+ {"current_steps": 840, "total_steps": 950, "loss": 0.1878, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.7061857885832893e-06, "epoch": 4.421052631578947, "percentage": 88.42, "elapsed_time": "0:09:35", "remaining_time": "0:01:15"}
93
+ {"current_steps": 850, "total_steps": 950, "loss": 0.1733, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.4128955322504966e-06, "epoch": 4.473684210526316, "percentage": 89.47, "elapsed_time": "0:09:41", "remaining_time": "0:01:08"}
94
+ {"current_steps": 860, "total_steps": 950, "loss": 0.193, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.1465185899987797e-06, "epoch": 4.526315789473684, "percentage": 90.53, "elapsed_time": "0:09:48", "remaining_time": "0:01:01"}
95
+ {"current_steps": 870, "total_steps": 950, "loss": 0.1802, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 9.073589027514789e-07, "epoch": 4.578947368421053, "percentage": 91.58, "elapsed_time": "0:09:55", "remaining_time": "0:00:54"}
96
+ {"current_steps": 880, "total_steps": 950, "loss": 0.1736, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 6.956893560502359e-07, "epoch": 4.631578947368421, "percentage": 92.63, "elapsed_time": "0:10:01", "remaining_time": "0:00:47"}
97
+ {"current_steps": 890, "total_steps": 950, "loss": 0.1761, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 5.117514686876379e-07, "epoch": 4.684210526315789, "percentage": 93.68, "elapsed_time": "0:10:08", "remaining_time": "0:00:41"}
98
+ {"current_steps": 900, "total_steps": 950, "loss": 0.172, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.557551171299051e-07, "epoch": 4.7368421052631575, "percentage": 94.74, "elapsed_time": "0:10:14", "remaining_time": "0:00:34"}
99
+ {"current_steps": 900, "total_steps": 950, "loss": null, "eval_loss": 0.6494551301002502, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 4.7368421052631575, "percentage": 94.74, "elapsed_time": "0:10:14", "remaining_time": "0:00:34"}
100
+ {"current_steps": 910, "total_steps": 950, "loss": 0.1734, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.27878296044029e-07, "epoch": 4.7894736842105265, "percentage": 95.79, "elapsed_time": "0:10:23", "remaining_time": "0:00:27"}
101
+ {"current_steps": 920, "total_steps": 950, "loss": 0.1954, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.2826691520262114e-07, "epoch": 4.842105263157895, "percentage": 96.84, "elapsed_time": "0:10:30", "remaining_time": "0:00:20"}
102
+ {"current_steps": 930, "total_steps": 950, "loss": 0.1744, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 5.7034632998231865e-08, "epoch": 4.894736842105263, "percentage": 97.89, "elapsed_time": "0:10:37", "remaining_time": "0:00:13"}
103
+ {"current_steps": 940, "total_steps": 950, "loss": 0.1778, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.4262726757049982e-08, "epoch": 4.947368421052632, "percentage": 98.95, "elapsed_time": "0:10:43", "remaining_time": "0:00:06"}
104
+ {"current_steps": 950, "total_steps": 950, "loss": 0.1836, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.0, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:10:50", "remaining_time": "0:00:00"}
105
+ {"current_steps": 950, "total_steps": 950, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:10:50", "remaining_time": "0:00:00"}
106
+ {"current_steps": 10, "total_steps": 10, "loss": null, "eval_loss": 0.5924356579780579, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:11:00", "remaining_time": "0:00:00"}
llama2_13b_peft/linguistics_puzzles/trainer_state.json ADDED
@@ -0,0 +1,767 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5924356579780579,
3
+ "best_model_checkpoint": "ckpt/llama2_13b_other/linguistics_puzzles_no_sys/checkpoint-500",
4
+ "epoch": 5.0,
5
+ "eval_steps": 100,
6
+ "global_step": 950,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05263157894736842,
13
+ "grad_norm": 1.5050264596939087,
14
+ "learning_rate": 2.5e-05,
15
+ "loss": 2.5922,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.10526315789473684,
20
+ "grad_norm": 1.5525988340377808,
21
+ "learning_rate": 5e-05,
22
+ "loss": 2.3206,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.15789473684210525,
27
+ "grad_norm": 1.7404705286026,
28
+ "learning_rate": 4.998573727324295e-05,
29
+ "loss": 1.7229,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.21052631578947367,
34
+ "grad_norm": 1.8962088823318481,
35
+ "learning_rate": 4.994296536700177e-05,
36
+ "loss": 1.3729,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.2631578947368421,
41
+ "grad_norm": 1.776729941368103,
42
+ "learning_rate": 4.987173308479738e-05,
43
+ "loss": 1.3635,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.3157894736842105,
48
+ "grad_norm": 11.020795822143555,
49
+ "learning_rate": 4.977212170395598e-05,
50
+ "loss": 1.3315,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.3684210526315789,
55
+ "grad_norm": 2.192176580429077,
56
+ "learning_rate": 4.964424488287009e-05,
57
+ "loss": 1.2515,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.42105263157894735,
62
+ "grad_norm": 2.4063496589660645,
63
+ "learning_rate": 4.948824853131236e-05,
64
+ "loss": 1.1872,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.47368421052631576,
69
+ "grad_norm": 2.7862613201141357,
70
+ "learning_rate": 4.930431064394977e-05,
71
+ "loss": 1.1552,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.5263157894736842,
76
+ "grad_norm": 3.5330026149749756,
77
+ "learning_rate": 4.909264109724853e-05,
78
+ "loss": 1.1276,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.5263157894736842,
83
+ "eval_loss": 1.0876480340957642,
84
+ "eval_runtime": 1.9022,
85
+ "eval_samples_per_second": 42.057,
86
+ "eval_steps_per_second": 5.257,
87
+ "step": 100
88
+ },
89
+ {
90
+ "epoch": 0.5789473684210527,
91
+ "grad_norm": 2.4774415493011475,
92
+ "learning_rate": 4.885348141000122e-05,
93
+ "loss": 1.1756,
94
+ "step": 110
95
+ },
96
+ {
97
+ "epoch": 0.631578947368421,
98
+ "grad_norm": 2.380500555038452,
99
+ "learning_rate": 4.858710446774951e-05,
100
+ "loss": 1.1106,
101
+ "step": 120
102
+ },
103
+ {
104
+ "epoch": 0.6842105263157895,
105
+ "grad_norm": 3.0656540393829346,
106
+ "learning_rate": 4.829381421141671e-05,
107
+ "loss": 1.0175,
108
+ "step": 130
109
+ },
110
+ {
111
+ "epoch": 0.7368421052631579,
112
+ "grad_norm": 5.256251811981201,
113
+ "learning_rate": 4.7973945290505766e-05,
114
+ "loss": 0.9733,
115
+ "step": 140
116
+ },
117
+ {
118
+ "epoch": 0.7894736842105263,
119
+ "grad_norm": 2.674135446548462,
120
+ "learning_rate": 4.7627862681258037e-05,
121
+ "loss": 0.9907,
122
+ "step": 150
123
+ },
124
+ {
125
+ "epoch": 0.8421052631578947,
126
+ "grad_norm": 3.5206069946289062,
127
+ "learning_rate": 4.725596127020879e-05,
128
+ "loss": 0.9312,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 0.8947368421052632,
133
+ "grad_norm": 3.4086978435516357,
134
+ "learning_rate": 4.685866540361456e-05,
135
+ "loss": 0.9586,
136
+ "step": 170
137
+ },
138
+ {
139
+ "epoch": 0.9473684210526315,
140
+ "grad_norm": 4.591642379760742,
141
+ "learning_rate": 4.643642840326627e-05,
142
+ "loss": 0.9595,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 1.0,
147
+ "grad_norm": 2.8823249340057373,
148
+ "learning_rate": 4.598973204924097e-05,
149
+ "loss": 0.8331,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 1.0526315789473684,
154
+ "grad_norm": 3.7064428329467773,
155
+ "learning_rate": 4.551908603018191e-05,
156
+ "loss": 0.8128,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 1.0526315789473684,
161
+ "eval_loss": 0.8153461217880249,
162
+ "eval_runtime": 1.9192,
163
+ "eval_samples_per_second": 41.684,
164
+ "eval_steps_per_second": 5.21,
165
+ "step": 200
166
+ },
167
+ {
168
+ "epoch": 1.1052631578947367,
169
+ "grad_norm": 4.2386274337768555,
170
+ "learning_rate": 4.502502736173462e-05,
171
+ "loss": 0.8186,
172
+ "step": 210
173
+ },
174
+ {
175
+ "epoch": 1.1578947368421053,
176
+ "grad_norm": 3.1767256259918213,
177
+ "learning_rate": 4.45081197738023e-05,
178
+ "loss": 0.6895,
179
+ "step": 220
180
+ },
181
+ {
182
+ "epoch": 1.2105263157894737,
183
+ "grad_norm": 3.748518466949463,
184
+ "learning_rate": 4.3968953067319777e-05,
185
+ "loss": 0.7901,
186
+ "step": 230
187
+ },
188
+ {
189
+ "epoch": 1.263157894736842,
190
+ "grad_norm": 3.807053565979004,
191
+ "learning_rate": 4.340814244127993e-05,
192
+ "loss": 0.704,
193
+ "step": 240
194
+ },
195
+ {
196
+ "epoch": 1.3157894736842106,
197
+ "grad_norm": 5.013542175292969,
198
+ "learning_rate": 4.282632779078051e-05,
199
+ "loss": 0.6879,
200
+ "step": 250
201
+ },
202
+ {
203
+ "epoch": 1.368421052631579,
204
+ "grad_norm": 4.752715110778809,
205
+ "learning_rate": 4.222417297689217e-05,
206
+ "loss": 0.7563,
207
+ "step": 260
208
+ },
209
+ {
210
+ "epoch": 1.4210526315789473,
211
+ "grad_norm": 3.6476950645446777,
212
+ "learning_rate": 4.160236506918098e-05,
213
+ "loss": 0.6846,
214
+ "step": 270
215
+ },
216
+ {
217
+ "epoch": 1.4736842105263157,
218
+ "grad_norm": 3.8758108615875244,
219
+ "learning_rate": 4.096161356174959e-05,
220
+ "loss": 0.7155,
221
+ "step": 280
222
+ },
223
+ {
224
+ "epoch": 1.526315789473684,
225
+ "grad_norm": 4.166601657867432,
226
+ "learning_rate": 4.030264956369157e-05,
227
+ "loss": 0.8037,
228
+ "step": 290
229
+ },
230
+ {
231
+ "epoch": 1.5789473684210527,
232
+ "grad_norm": 4.603171348571777,
233
+ "learning_rate": 3.962622496488269e-05,
234
+ "loss": 0.6705,
235
+ "step": 300
236
+ },
237
+ {
238
+ "epoch": 1.5789473684210527,
239
+ "eval_loss": 0.6891714930534363,
240
+ "eval_runtime": 1.9174,
241
+ "eval_samples_per_second": 41.724,
242
+ "eval_steps_per_second": 5.216,
243
+ "step": 300
244
+ },
245
+ {
246
+ "epoch": 1.631578947368421,
247
+ "grad_norm": 3.820142984390259,
248
+ "learning_rate": 3.893311157806091e-05,
249
+ "loss": 0.6389,
250
+ "step": 310
251
+ },
252
+ {
253
+ "epoch": 1.6842105263157894,
254
+ "grad_norm": 5.900814533233643,
255
+ "learning_rate": 3.822410025817406e-05,
256
+ "loss": 0.7223,
257
+ "step": 320
258
+ },
259
+ {
260
+ "epoch": 1.736842105263158,
261
+ "grad_norm": 4.315140724182129,
262
+ "learning_rate": 3.7500000000000003e-05,
263
+ "loss": 0.6948,
264
+ "step": 330
265
+ },
266
+ {
267
+ "epoch": 1.7894736842105263,
268
+ "grad_norm": 4.747324466705322,
269
+ "learning_rate": 3.67616370150689e-05,
270
+ "loss": 0.6658,
271
+ "step": 340
272
+ },
273
+ {
274
+ "epoch": 1.8421052631578947,
275
+ "grad_norm": 3.504014492034912,
276
+ "learning_rate": 3.600985378894086e-05,
277
+ "loss": 0.643,
278
+ "step": 350
279
+ },
280
+ {
281
+ "epoch": 1.8947368421052633,
282
+ "grad_norm": 5.181077480316162,
283
+ "learning_rate": 3.5245508119914687e-05,
284
+ "loss": 0.6537,
285
+ "step": 360
286
+ },
287
+ {
288
+ "epoch": 1.9473684210526314,
289
+ "grad_norm": 5.073149681091309,
290
+ "learning_rate": 3.44694721402644e-05,
291
+ "loss": 0.641,
292
+ "step": 370
293
+ },
294
+ {
295
+ "epoch": 2.0,
296
+ "grad_norm": 5.070895671844482,
297
+ "learning_rate": 3.3682631321120504e-05,
298
+ "loss": 0.6708,
299
+ "step": 380
300
+ },
301
+ {
302
+ "epoch": 2.0526315789473686,
303
+ "grad_norm": 5.305852890014648,
304
+ "learning_rate": 3.2885883462131394e-05,
305
+ "loss": 0.5061,
306
+ "step": 390
307
+ },
308
+ {
309
+ "epoch": 2.1052631578947367,
310
+ "grad_norm": 6.452213287353516,
311
+ "learning_rate": 3.2080137667057595e-05,
312
+ "loss": 0.4876,
313
+ "step": 400
314
+ },
315
+ {
316
+ "epoch": 2.1052631578947367,
317
+ "eval_loss": 0.6224929690361023,
318
+ "eval_runtime": 1.9167,
319
+ "eval_samples_per_second": 41.739,
320
+ "eval_steps_per_second": 5.217,
321
+ "step": 400
322
+ },
323
+ {
324
+ "epoch": 2.1578947368421053,
325
+ "grad_norm": 3.6080775260925293,
326
+ "learning_rate": 3.126631330646802e-05,
327
+ "loss": 0.485,
328
+ "step": 410
329
+ },
330
+ {
331
+ "epoch": 2.2105263157894735,
332
+ "grad_norm": 2.2630574703216553,
333
+ "learning_rate": 3.0445338968721287e-05,
334
+ "loss": 0.536,
335
+ "step": 420
336
+ },
337
+ {
338
+ "epoch": 2.263157894736842,
339
+ "grad_norm": 4.616273880004883,
340
+ "learning_rate": 2.961815140042974e-05,
341
+ "loss": 0.4493,
342
+ "step": 430
343
+ },
344
+ {
345
+ "epoch": 2.3157894736842106,
346
+ "grad_norm": 4.5297956466674805,
347
+ "learning_rate": 2.878569443761442e-05,
348
+ "loss": 0.4806,
349
+ "step": 440
350
+ },
351
+ {
352
+ "epoch": 2.3684210526315788,
353
+ "grad_norm": 4.910376071929932,
354
+ "learning_rate": 2.7948917928771158e-05,
355
+ "loss": 0.4642,
356
+ "step": 450
357
+ },
358
+ {
359
+ "epoch": 2.4210526315789473,
360
+ "grad_norm": 4.3276801109313965,
361
+ "learning_rate": 2.7108776651076118e-05,
362
+ "loss": 0.4857,
363
+ "step": 460
364
+ },
365
+ {
366
+ "epoch": 2.473684210526316,
367
+ "grad_norm": 3.657116413116455,
368
+ "learning_rate": 2.6266229220967818e-05,
369
+ "loss": 0.4604,
370
+ "step": 470
371
+ },
372
+ {
373
+ "epoch": 2.526315789473684,
374
+ "grad_norm": 4.7539896965026855,
375
+ "learning_rate": 2.5422237000348276e-05,
376
+ "loss": 0.4294,
377
+ "step": 480
378
+ },
379
+ {
380
+ "epoch": 2.5789473684210527,
381
+ "grad_norm": 4.227921962738037,
382
+ "learning_rate": 2.4577762999651726e-05,
383
+ "loss": 0.436,
384
+ "step": 490
385
+ },
386
+ {
387
+ "epoch": 2.6315789473684212,
388
+ "grad_norm": 6.821872234344482,
389
+ "learning_rate": 2.3733770779032184e-05,
390
+ "loss": 0.4435,
391
+ "step": 500
392
+ },
393
+ {
394
+ "epoch": 2.6315789473684212,
395
+ "eval_loss": 0.5924356579780579,
396
+ "eval_runtime": 1.9193,
397
+ "eval_samples_per_second": 41.683,
398
+ "eval_steps_per_second": 5.21,
399
+ "step": 500
400
+ },
401
+ {
402
+ "epoch": 2.6842105263157894,
403
+ "grad_norm": 4.023755073547363,
404
+ "learning_rate": 2.2891223348923884e-05,
405
+ "loss": 0.4128,
406
+ "step": 510
407
+ },
408
+ {
409
+ "epoch": 2.736842105263158,
410
+ "grad_norm": 4.245009899139404,
411
+ "learning_rate": 2.2051082071228854e-05,
412
+ "loss": 0.4201,
413
+ "step": 520
414
+ },
415
+ {
416
+ "epoch": 2.7894736842105265,
417
+ "grad_norm": 7.485212326049805,
418
+ "learning_rate": 2.1214305562385592e-05,
419
+ "loss": 0.4144,
420
+ "step": 530
421
+ },
422
+ {
423
+ "epoch": 2.8421052631578947,
424
+ "grad_norm": 3.890044689178467,
425
+ "learning_rate": 2.0381848599570276e-05,
426
+ "loss": 0.4325,
427
+ "step": 540
428
+ },
429
+ {
430
+ "epoch": 2.8947368421052633,
431
+ "grad_norm": 5.785126686096191,
432
+ "learning_rate": 1.9554661031278712e-05,
433
+ "loss": 0.4539,
434
+ "step": 550
435
+ },
436
+ {
437
+ "epoch": 2.9473684210526314,
438
+ "grad_norm": 3.959681272506714,
439
+ "learning_rate": 1.8733686693531985e-05,
440
+ "loss": 0.3898,
441
+ "step": 560
442
+ },
443
+ {
444
+ "epoch": 3.0,
445
+ "grad_norm": 6.1470160484313965,
446
+ "learning_rate": 1.79198623329424e-05,
447
+ "loss": 0.4347,
448
+ "step": 570
449
+ },
450
+ {
451
+ "epoch": 3.0526315789473686,
452
+ "grad_norm": 6.080893039703369,
453
+ "learning_rate": 1.711411653786861e-05,
454
+ "loss": 0.2771,
455
+ "step": 580
456
+ },
457
+ {
458
+ "epoch": 3.1052631578947367,
459
+ "grad_norm": 3.995936155319214,
460
+ "learning_rate": 1.6317368678879495e-05,
461
+ "loss": 0.2786,
462
+ "step": 590
463
+ },
464
+ {
465
+ "epoch": 3.1578947368421053,
466
+ "grad_norm": 4.9943084716796875,
467
+ "learning_rate": 1.55305278597356e-05,
468
+ "loss": 0.2743,
469
+ "step": 600
470
+ },
471
+ {
472
+ "epoch": 3.1578947368421053,
473
+ "eval_loss": 0.6151354908943176,
474
+ "eval_runtime": 1.9185,
475
+ "eval_samples_per_second": 41.7,
476
+ "eval_steps_per_second": 5.212,
477
+ "step": 600
478
+ },
479
+ {
480
+ "epoch": 3.2105263157894735,
481
+ "grad_norm": 3.650193452835083,
482
+ "learning_rate": 1.475449188008532e-05,
483
+ "loss": 0.2611,
484
+ "step": 610
485
+ },
486
+ {
487
+ "epoch": 3.263157894736842,
488
+ "grad_norm": 3.5425643920898438,
489
+ "learning_rate": 1.399014621105914e-05,
490
+ "loss": 0.237,
491
+ "step": 620
492
+ },
493
+ {
494
+ "epoch": 3.3157894736842106,
495
+ "grad_norm": 4.187167644500732,
496
+ "learning_rate": 1.3238362984931113e-05,
497
+ "loss": 0.2319,
498
+ "step": 630
499
+ },
500
+ {
501
+ "epoch": 3.3684210526315788,
502
+ "grad_norm": 3.7174108028411865,
503
+ "learning_rate": 1.2500000000000006e-05,
504
+ "loss": 0.2785,
505
+ "step": 640
506
+ },
507
+ {
508
+ "epoch": 3.4210526315789473,
509
+ "grad_norm": 4.665218353271484,
510
+ "learning_rate": 1.1775899741825947e-05,
511
+ "loss": 0.3323,
512
+ "step": 650
513
+ },
514
+ {
515
+ "epoch": 3.473684210526316,
516
+ "grad_norm": 6.711315631866455,
517
+ "learning_rate": 1.1066888421939093e-05,
518
+ "loss": 0.2762,
519
+ "step": 660
520
+ },
521
+ {
522
+ "epoch": 3.526315789473684,
523
+ "grad_norm": 4.101406097412109,
524
+ "learning_rate": 1.0373775035117305e-05,
525
+ "loss": 0.2982,
526
+ "step": 670
527
+ },
528
+ {
529
+ "epoch": 3.5789473684210527,
530
+ "grad_norm": 3.3571157455444336,
531
+ "learning_rate": 9.697350436308427e-06,
532
+ "loss": 0.2338,
533
+ "step": 680
534
+ },
535
+ {
536
+ "epoch": 3.6315789473684212,
537
+ "grad_norm": 7.152629852294922,
538
+ "learning_rate": 9.038386438250415e-06,
539
+ "loss": 0.2962,
540
+ "step": 690
541
+ },
542
+ {
543
+ "epoch": 3.6842105263157894,
544
+ "grad_norm": 5.147871971130371,
545
+ "learning_rate": 8.397634930819021e-06,
546
+ "loss": 0.2846,
547
+ "step": 700
548
+ },
549
+ {
550
+ "epoch": 3.6842105263157894,
551
+ "eval_loss": 0.6083844900131226,
552
+ "eval_runtime": 1.9199,
553
+ "eval_samples_per_second": 41.67,
554
+ "eval_steps_per_second": 5.209,
555
+ "step": 700
556
+ },
557
+ {
558
+ "epoch": 3.736842105263158,
559
+ "grad_norm": 3.984264373779297,
560
+ "learning_rate": 7.775827023107835e-06,
561
+ "loss": 0.2895,
562
+ "step": 710
563
+ },
564
+ {
565
+ "epoch": 3.7894736842105265,
566
+ "grad_norm": 6.230710983276367,
567
+ "learning_rate": 7.173672209219495e-06,
568
+ "loss": 0.3261,
569
+ "step": 720
570
+ },
571
+ {
572
+ "epoch": 3.8421052631578947,
573
+ "grad_norm": 3.685063362121582,
574
+ "learning_rate": 6.591857558720071e-06,
575
+ "loss": 0.2358,
576
+ "step": 730
577
+ },
578
+ {
579
+ "epoch": 3.8947368421052633,
580
+ "grad_norm": 4.337435245513916,
581
+ "learning_rate": 6.031046932680229e-06,
582
+ "loss": 0.2723,
583
+ "step": 740
584
+ },
585
+ {
586
+ "epoch": 3.9473684210526314,
587
+ "grad_norm": 4.504445552825928,
588
+ "learning_rate": 5.491880226197707e-06,
589
+ "loss": 0.2941,
590
+ "step": 750
591
+ },
592
+ {
593
+ "epoch": 4.0,
594
+ "grad_norm": 4.7959442138671875,
595
+ "learning_rate": 4.9749726382653905e-06,
596
+ "loss": 0.2721,
597
+ "step": 760
598
+ },
599
+ {
600
+ "epoch": 4.052631578947368,
601
+ "grad_norm": 2.663322925567627,
602
+ "learning_rate": 4.480913969818098e-06,
603
+ "loss": 0.1677,
604
+ "step": 770
605
+ },
606
+ {
607
+ "epoch": 4.105263157894737,
608
+ "grad_norm": 5.704188346862793,
609
+ "learning_rate": 4.010267950759025e-06,
610
+ "loss": 0.2291,
611
+ "step": 780
612
+ },
613
+ {
614
+ "epoch": 4.157894736842105,
615
+ "grad_norm": 4.857370853424072,
616
+ "learning_rate": 3.5635715967337223e-06,
617
+ "loss": 0.1991,
618
+ "step": 790
619
+ },
620
+ {
621
+ "epoch": 4.2105263157894735,
622
+ "grad_norm": 2.6290528774261475,
623
+ "learning_rate": 3.141334596385448e-06,
624
+ "loss": 0.2069,
625
+ "step": 800
626
+ },
627
+ {
628
+ "epoch": 4.2105263157894735,
629
+ "eval_loss": 0.6427180767059326,
630
+ "eval_runtime": 1.9195,
631
+ "eval_samples_per_second": 41.677,
632
+ "eval_steps_per_second": 5.21,
633
+ "step": 800
634
+ },
635
+ {
636
+ "epoch": 4.2631578947368425,
637
+ "grad_norm": 6.7939558029174805,
638
+ "learning_rate": 2.7440387297912123e-06,
639
+ "loss": 0.2213,
640
+ "step": 810
641
+ },
642
+ {
643
+ "epoch": 4.315789473684211,
644
+ "grad_norm": 5.425328731536865,
645
+ "learning_rate": 2.372137318741968e-06,
646
+ "loss": 0.2008,
647
+ "step": 820
648
+ },
649
+ {
650
+ "epoch": 4.368421052631579,
651
+ "grad_norm": 3.0159809589385986,
652
+ "learning_rate": 2.026054709494235e-06,
653
+ "loss": 0.2178,
654
+ "step": 830
655
+ },
656
+ {
657
+ "epoch": 4.421052631578947,
658
+ "grad_norm": 4.54276704788208,
659
+ "learning_rate": 1.7061857885832893e-06,
660
+ "loss": 0.1878,
661
+ "step": 840
662
+ },
663
+ {
664
+ "epoch": 4.473684210526316,
665
+ "grad_norm": 4.1157755851745605,
666
+ "learning_rate": 1.4128955322504966e-06,
667
+ "loss": 0.1733,
668
+ "step": 850
669
+ },
670
+ {
671
+ "epoch": 4.526315789473684,
672
+ "grad_norm": 4.860106945037842,
673
+ "learning_rate": 1.1465185899987797e-06,
674
+ "loss": 0.193,
675
+ "step": 860
676
+ },
677
+ {
678
+ "epoch": 4.578947368421053,
679
+ "grad_norm": 4.945047378540039,
680
+ "learning_rate": 9.073589027514789e-07,
681
+ "loss": 0.1802,
682
+ "step": 870
683
+ },
684
+ {
685
+ "epoch": 4.631578947368421,
686
+ "grad_norm": 2.316741943359375,
687
+ "learning_rate": 6.956893560502359e-07,
688
+ "loss": 0.1736,
689
+ "step": 880
690
+ },
691
+ {
692
+ "epoch": 4.684210526315789,
693
+ "grad_norm": 4.012813091278076,
694
+ "learning_rate": 5.117514686876379e-07,
695
+ "loss": 0.1761,
696
+ "step": 890
697
+ },
698
+ {
699
+ "epoch": 4.7368421052631575,
700
+ "grad_norm": 5.301681995391846,
701
+ "learning_rate": 3.557551171299051e-07,
702
+ "loss": 0.172,
703
+ "step": 900
704
+ },
705
+ {
706
+ "epoch": 4.7368421052631575,
707
+ "eval_loss": 0.6494551301002502,
708
+ "eval_runtime": 1.9201,
709
+ "eval_samples_per_second": 41.665,
710
+ "eval_steps_per_second": 5.208,
711
+ "step": 900
712
+ },
713
+ {
714
+ "epoch": 4.7894736842105265,
715
+ "grad_norm": 3.559140205383301,
716
+ "learning_rate": 2.27878296044029e-07,
717
+ "loss": 0.1734,
718
+ "step": 910
719
+ },
720
+ {
721
+ "epoch": 4.842105263157895,
722
+ "grad_norm": 7.743849277496338,
723
+ "learning_rate": 1.2826691520262114e-07,
724
+ "loss": 0.1954,
725
+ "step": 920
726
+ },
727
+ {
728
+ "epoch": 4.894736842105263,
729
+ "grad_norm": 3.5408854484558105,
730
+ "learning_rate": 5.7034632998231865e-08,
731
+ "loss": 0.1744,
732
+ "step": 930
733
+ },
734
+ {
735
+ "epoch": 4.947368421052632,
736
+ "grad_norm": 2.413121461868286,
737
+ "learning_rate": 1.4262726757049982e-08,
738
+ "loss": 0.1778,
739
+ "step": 940
740
+ },
741
+ {
742
+ "epoch": 5.0,
743
+ "grad_norm": 2.56962513923645,
744
+ "learning_rate": 0.0,
745
+ "loss": 0.1836,
746
+ "step": 950
747
+ },
748
+ {
749
+ "epoch": 5.0,
750
+ "step": 950,
751
+ "total_flos": 2.0275085174217114e+17,
752
+ "train_loss": 0.5822552880487945,
753
+ "train_runtime": 660.0352,
754
+ "train_samples_per_second": 11.515,
755
+ "train_steps_per_second": 1.439
756
+ }
757
+ ],
758
+ "logging_steps": 10,
759
+ "max_steps": 950,
760
+ "num_input_tokens_seen": 0,
761
+ "num_train_epochs": 5,
762
+ "save_steps": 100,
763
+ "total_flos": 2.0275085174217114e+17,
764
+ "train_batch_size": 4,
765
+ "trial_name": null,
766
+ "trial_params": null
767
+ }
llama2_13b_peft/linguistics_puzzles/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67ac58d8b967dcc701c74de72e5e18349db160299022d297808b6aa2f75860a0
3
+ size 5176
llama2_13b_peft/linguistics_puzzles/training_eval_loss.png ADDED
llama2_13b_peft/linguistics_puzzles/training_loss.png ADDED
llama2_13b_peft/news_commentary_de/README.md ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: peft
4
+ tags:
5
+ - llama-factory
6
+ - lora
7
+ - generated_from_trainer
8
+ base_model: /data1/model/llama2/meta-llama/Llama2-13b
9
+ model-index:
10
+ - name: news_commentary_de_no_sys
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # news_commentary_de_no_sys
18
+
19
+ This model is a fine-tuned version of [/data1/model/llama2/meta-llama/Llama2-13b](https://huggingface.co//data1/model/llama2/meta-llama/Llama2-13b) on the news_commentary_de_no_sys dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.6944
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 1e-05
41
+ - train_batch_size: 8
42
+ - eval_batch_size: 8
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - num_devices: 2
46
+ - total_train_batch_size: 16
47
+ - total_eval_batch_size: 16
48
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
+ - lr_scheduler_type: cosine
50
+ - lr_scheduler_warmup_steps: 20
51
+ - num_epochs: 10.0
52
+
53
+ ### Training results
54
+
55
+ | Training Loss | Epoch | Step | Validation Loss |
56
+ |:-------------:|:-----:|:----:|:---------------:|
57
+ | 0.7429 | 0.13 | 200 | 0.7712 |
58
+ | 0.7549 | 0.25 | 400 | 0.7434 |
59
+ | 0.7552 | 0.38 | 600 | 0.7330 |
60
+ | 0.7265 | 0.5 | 800 | 0.7256 |
61
+ | 0.7524 | 0.63 | 1000 | 0.7200 |
62
+ | 0.6976 | 0.75 | 1200 | 0.7151 |
63
+ | 0.7408 | 0.88 | 1400 | 0.7116 |
64
+ | 0.701 | 1.0 | 1600 | 0.7085 |
65
+ | 0.7084 | 1.13 | 1800 | 0.7059 |
66
+ | 0.6999 | 1.25 | 2000 | 0.7040 |
67
+ | 0.7182 | 1.38 | 2200 | 0.7022 |
68
+ | 0.7267 | 1.51 | 2400 | 0.6994 |
69
+ | 0.6912 | 1.63 | 2600 | 0.6972 |
70
+ | 0.6821 | 1.76 | 2800 | 0.6954 |
71
+ | 0.7104 | 1.88 | 3000 | 0.6944 |
72
+ | 0.6222 | 2.01 | 3200 | 0.6934 |
73
+ | 0.6383 | 2.13 | 3400 | 0.6974 |
74
+ | 0.6436 | 2.26 | 3600 | 0.6981 |
75
+ | 0.6444 | 2.38 | 3800 | 0.6968 |
76
+ | 0.6368 | 2.51 | 4000 | 0.6987 |
77
+
78
+
79
+ ### Framework versions
80
+
81
+ - PEFT 0.9.0
82
+ - Transformers 4.38.2
83
+ - Pytorch 2.2.1
84
+ - Datasets 2.18.0
85
+ - Tokenizers 0.15.2
llama2_13b_peft/news_commentary_de/adapter_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/data1/model/llama2/meta-llama/Llama2-13b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.0,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 8,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "up_proj",
23
+ "down_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "o_proj",
27
+ "gate_proj",
28
+ "q_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "use_dora": false,
32
+ "use_rslora": false
33
+ }
llama2_13b_peft/news_commentary_de/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44a2422055e9542643288a7443b823001443ae5a402e2cff85e691f7121a6398
3
+ size 125248064
llama2_13b_peft/news_commentary_de/all_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.51,
3
+ "eval_loss": 0.6943792104721069,
4
+ "eval_runtime": 64.8294,
5
+ "eval_samples_per_second": 69.413,
6
+ "eval_steps_per_second": 4.35,
7
+ "train_loss": 0.7081527805328369,
8
+ "train_runtime": 4312.5386,
9
+ "train_samples_per_second": 59.13,
10
+ "train_steps_per_second": 3.696
11
+ }
llama2_13b_peft/news_commentary_de/eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.51,
3
+ "eval_loss": 0.6943792104721069,
4
+ "eval_runtime": 64.8294,
5
+ "eval_samples_per_second": 69.413,
6
+ "eval_steps_per_second": 4.35
7
+ }
llama2_13b_peft/news_commentary_de/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
llama2_13b_peft/news_commentary_de/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
llama2_13b_peft/news_commentary_de/tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "LlamaTokenizer",
43
+ "unk_token": "<unk>",
44
+ "use_default_system_prompt": false
45
+ }
llama2_13b_peft/news_commentary_de/train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.51,
3
+ "train_loss": 0.7081527805328369,
4
+ "train_runtime": 4312.5386,
5
+ "train_samples_per_second": 59.13,
6
+ "train_steps_per_second": 3.696
7
+ }
llama2_13b_peft/news_commentary_de/trainer_log.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llama2_13b_peft/news_commentary_de/trainer_state.json ADDED
@@ -0,0 +1,2990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6943792104721069,
3
+ "best_model_checkpoint": "ckpt/llama2_13b_fuze30_no_sys/news_commentary_de_no_sys/checkpoint-3000",
4
+ "epoch": 2.509410288582183,
5
+ "eval_steps": 200,
6
+ "global_step": 4000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01,
13
+ "grad_norm": 0.5409977436065674,
14
+ "learning_rate": 5e-06,
15
+ "loss": 1.3994,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.01,
20
+ "grad_norm": 0.850004255771637,
21
+ "learning_rate": 1e-05,
22
+ "loss": 1.4561,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.02,
27
+ "grad_norm": 0.8501812219619751,
28
+ "learning_rate": 9.999990264607035e-06,
29
+ "loss": 1.3697,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.03,
34
+ "grad_norm": 0.6338475346565247,
35
+ "learning_rate": 9.999961058466052e-06,
36
+ "loss": 1.3627,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.03,
41
+ "grad_norm": 0.7430967688560486,
42
+ "learning_rate": 9.999912381690781e-06,
43
+ "loss": 1.1155,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.04,
48
+ "grad_norm": 0.5487976670265198,
49
+ "learning_rate": 9.999844234470782e-06,
50
+ "loss": 0.9492,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.04,
55
+ "grad_norm": 0.3653506934642792,
56
+ "learning_rate": 9.999756617071427e-06,
57
+ "loss": 0.9067,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.05,
62
+ "grad_norm": 0.38920339941978455,
63
+ "learning_rate": 9.999649529833915e-06,
64
+ "loss": 0.8848,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.06,
69
+ "grad_norm": 0.4155251979827881,
70
+ "learning_rate": 9.999522973175257e-06,
71
+ "loss": 0.798,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.06,
76
+ "grad_norm": 0.4156494438648224,
77
+ "learning_rate": 9.999376947588288e-06,
78
+ "loss": 0.8782,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.07,
83
+ "grad_norm": 0.4306489825248718,
84
+ "learning_rate": 9.99921145364165e-06,
85
+ "loss": 0.8124,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.08,
90
+ "grad_norm": 0.39355072379112244,
91
+ "learning_rate": 9.999026491979809e-06,
92
+ "loss": 0.838,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.08,
97
+ "grad_norm": 0.4246688783168793,
98
+ "learning_rate": 9.99882206332303e-06,
99
+ "loss": 0.8383,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.09,
104
+ "grad_norm": 0.47585156559944153,
105
+ "learning_rate": 9.99859816846739e-06,
106
+ "loss": 0.8705,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.09,
111
+ "grad_norm": 0.48569419980049133,
112
+ "learning_rate": 9.998354808284774e-06,
113
+ "loss": 0.7872,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.1,
118
+ "grad_norm": 0.5107733011245728,
119
+ "learning_rate": 9.998091983722862e-06,
120
+ "loss": 0.789,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.11,
125
+ "grad_norm": 0.5669977068901062,
126
+ "learning_rate": 9.997809695805136e-06,
127
+ "loss": 0.7749,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.11,
132
+ "grad_norm": 0.49600809812545776,
133
+ "learning_rate": 9.99750794563087e-06,
134
+ "loss": 0.7935,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.12,
139
+ "grad_norm": 0.45251163840293884,
140
+ "learning_rate": 9.997186734375124e-06,
141
+ "loss": 0.7817,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.13,
146
+ "grad_norm": 0.46742165088653564,
147
+ "learning_rate": 9.996846063288746e-06,
148
+ "loss": 0.7429,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.13,
153
+ "eval_loss": 0.7712445855140686,
154
+ "eval_runtime": 64.6782,
155
+ "eval_samples_per_second": 69.575,
156
+ "eval_steps_per_second": 4.36,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 0.13,
161
+ "grad_norm": 0.5643576383590698,
162
+ "learning_rate": 9.996485933698364e-06,
163
+ "loss": 0.7636,
164
+ "step": 210
165
+ },
166
+ {
167
+ "epoch": 0.14,
168
+ "grad_norm": 0.4915783405303955,
169
+ "learning_rate": 9.996106347006378e-06,
170
+ "loss": 0.7856,
171
+ "step": 220
172
+ },
173
+ {
174
+ "epoch": 0.14,
175
+ "grad_norm": 0.3926757574081421,
176
+ "learning_rate": 9.99570730469096e-06,
177
+ "loss": 0.7529,
178
+ "step": 230
179
+ },
180
+ {
181
+ "epoch": 0.15,
182
+ "grad_norm": 0.3297576606273651,
183
+ "learning_rate": 9.995288808306041e-06,
184
+ "loss": 0.7671,
185
+ "step": 240
186
+ },
187
+ {
188
+ "epoch": 0.16,
189
+ "grad_norm": 0.45379459857940674,
190
+ "learning_rate": 9.994850859481312e-06,
191
+ "loss": 0.7231,
192
+ "step": 250
193
+ },
194
+ {
195
+ "epoch": 0.16,
196
+ "grad_norm": 0.5688673853874207,
197
+ "learning_rate": 9.994393459922219e-06,
198
+ "loss": 0.7694,
199
+ "step": 260
200
+ },
201
+ {
202
+ "epoch": 0.17,
203
+ "grad_norm": 0.6590914130210876,
204
+ "learning_rate": 9.993916611409941e-06,
205
+ "loss": 0.7661,
206
+ "step": 270
207
+ },
208
+ {
209
+ "epoch": 0.18,
210
+ "grad_norm": 0.4207383692264557,
211
+ "learning_rate": 9.993420315801406e-06,
212
+ "loss": 0.7952,
213
+ "step": 280
214
+ },
215
+ {
216
+ "epoch": 0.18,
217
+ "grad_norm": 0.47460174560546875,
218
+ "learning_rate": 9.992904575029265e-06,
219
+ "loss": 0.7966,
220
+ "step": 290
221
+ },
222
+ {
223
+ "epoch": 0.19,
224
+ "grad_norm": 0.6118924617767334,
225
+ "learning_rate": 9.992369391101895e-06,
226
+ "loss": 0.8167,
227
+ "step": 300
228
+ },
229
+ {
230
+ "epoch": 0.19,
231
+ "grad_norm": 0.44934767484664917,
232
+ "learning_rate": 9.991814766103386e-06,
233
+ "loss": 0.7368,
234
+ "step": 310
235
+ },
236
+ {
237
+ "epoch": 0.2,
238
+ "grad_norm": 0.5106733441352844,
239
+ "learning_rate": 9.991240702193532e-06,
240
+ "loss": 0.7796,
241
+ "step": 320
242
+ },
243
+ {
244
+ "epoch": 0.21,
245
+ "grad_norm": 0.4405980706214905,
246
+ "learning_rate": 9.99064720160783e-06,
247
+ "loss": 0.7727,
248
+ "step": 330
249
+ },
250
+ {
251
+ "epoch": 0.21,
252
+ "grad_norm": 0.6010485887527466,
253
+ "learning_rate": 9.990034266657468e-06,
254
+ "loss": 0.7604,
255
+ "step": 340
256
+ },
257
+ {
258
+ "epoch": 0.22,
259
+ "grad_norm": 0.6098916530609131,
260
+ "learning_rate": 9.989401899729307e-06,
261
+ "loss": 0.7399,
262
+ "step": 350
263
+ },
264
+ {
265
+ "epoch": 0.23,
266
+ "grad_norm": 0.5837363004684448,
267
+ "learning_rate": 9.988750103285883e-06,
268
+ "loss": 0.7715,
269
+ "step": 360
270
+ },
271
+ {
272
+ "epoch": 0.23,
273
+ "grad_norm": 0.49089643359184265,
274
+ "learning_rate": 9.988078879865396e-06,
275
+ "loss": 0.738,
276
+ "step": 370
277
+ },
278
+ {
279
+ "epoch": 0.24,
280
+ "grad_norm": 0.508166491985321,
281
+ "learning_rate": 9.987388232081694e-06,
282
+ "loss": 0.8025,
283
+ "step": 380
284
+ },
285
+ {
286
+ "epoch": 0.24,
287
+ "grad_norm": 0.6415013074874878,
288
+ "learning_rate": 9.98667816262427e-06,
289
+ "loss": 0.7561,
290
+ "step": 390
291
+ },
292
+ {
293
+ "epoch": 0.25,
294
+ "grad_norm": 0.5850837230682373,
295
+ "learning_rate": 9.985948674258243e-06,
296
+ "loss": 0.7549,
297
+ "step": 400
298
+ },
299
+ {
300
+ "epoch": 0.25,
301
+ "eval_loss": 0.743410587310791,
302
+ "eval_runtime": 64.8376,
303
+ "eval_samples_per_second": 69.404,
304
+ "eval_steps_per_second": 4.349,
305
+ "step": 400
306
+ },
307
+ {
308
+ "epoch": 0.26,
309
+ "grad_norm": 0.627358615398407,
310
+ "learning_rate": 9.985199769824359e-06,
311
+ "loss": 0.7694,
312
+ "step": 410
313
+ },
314
+ {
315
+ "epoch": 0.26,
316
+ "grad_norm": 0.7586867213249207,
317
+ "learning_rate": 9.984431452238968e-06,
318
+ "loss": 0.7353,
319
+ "step": 420
320
+ },
321
+ {
322
+ "epoch": 0.27,
323
+ "grad_norm": 0.5713008642196655,
324
+ "learning_rate": 9.983643724494017e-06,
325
+ "loss": 0.7299,
326
+ "step": 430
327
+ },
328
+ {
329
+ "epoch": 0.28,
330
+ "grad_norm": 0.5664968490600586,
331
+ "learning_rate": 9.982836589657043e-06,
332
+ "loss": 0.754,
333
+ "step": 440
334
+ },
335
+ {
336
+ "epoch": 0.28,
337
+ "grad_norm": 0.4575900435447693,
338
+ "learning_rate": 9.98201005087116e-06,
339
+ "loss": 0.7355,
340
+ "step": 450
341
+ },
342
+ {
343
+ "epoch": 0.29,
344
+ "grad_norm": 0.6498897075653076,
345
+ "learning_rate": 9.981164111355036e-06,
346
+ "loss": 0.7543,
347
+ "step": 460
348
+ },
349
+ {
350
+ "epoch": 0.29,
351
+ "grad_norm": 0.6509144306182861,
352
+ "learning_rate": 9.98029877440289e-06,
353
+ "loss": 0.7568,
354
+ "step": 470
355
+ },
356
+ {
357
+ "epoch": 0.3,
358
+ "grad_norm": 0.44653260707855225,
359
+ "learning_rate": 9.979414043384485e-06,
360
+ "loss": 0.7313,
361
+ "step": 480
362
+ },
363
+ {
364
+ "epoch": 0.31,
365
+ "grad_norm": 0.7275229096412659,
366
+ "learning_rate": 9.978509921745101e-06,
367
+ "loss": 0.7456,
368
+ "step": 490
369
+ },
370
+ {
371
+ "epoch": 0.31,
372
+ "grad_norm": 0.4918762147426605,
373
+ "learning_rate": 9.97758641300553e-06,
374
+ "loss": 0.7585,
375
+ "step": 500
376
+ },
377
+ {
378
+ "epoch": 0.32,
379
+ "grad_norm": 0.5181304216384888,
380
+ "learning_rate": 9.97664352076206e-06,
381
+ "loss": 0.7311,
382
+ "step": 510
383
+ },
384
+ {
385
+ "epoch": 0.33,
386
+ "grad_norm": 0.5354281663894653,
387
+ "learning_rate": 9.97568124868646e-06,
388
+ "loss": 0.7173,
389
+ "step": 520
390
+ },
391
+ {
392
+ "epoch": 0.33,
393
+ "grad_norm": 0.47694316506385803,
394
+ "learning_rate": 9.974699600525972e-06,
395
+ "loss": 0.7408,
396
+ "step": 530
397
+ },
398
+ {
399
+ "epoch": 0.34,
400
+ "grad_norm": 0.5888867974281311,
401
+ "learning_rate": 9.973698580103286e-06,
402
+ "loss": 0.757,
403
+ "step": 540
404
+ },
405
+ {
406
+ "epoch": 0.35,
407
+ "grad_norm": 0.7656754851341248,
408
+ "learning_rate": 9.972678191316533e-06,
409
+ "loss": 0.7717,
410
+ "step": 550
411
+ },
412
+ {
413
+ "epoch": 0.35,
414
+ "grad_norm": 0.5808092355728149,
415
+ "learning_rate": 9.971638438139266e-06,
416
+ "loss": 0.7314,
417
+ "step": 560
418
+ },
419
+ {
420
+ "epoch": 0.36,
421
+ "grad_norm": 0.5002965331077576,
422
+ "learning_rate": 9.97057932462045e-06,
423
+ "loss": 0.7112,
424
+ "step": 570
425
+ },
426
+ {
427
+ "epoch": 0.36,
428
+ "grad_norm": 0.6044530272483826,
429
+ "learning_rate": 9.96950085488444e-06,
430
+ "loss": 0.7802,
431
+ "step": 580
432
+ },
433
+ {
434
+ "epoch": 0.37,
435
+ "grad_norm": 0.48741769790649414,
436
+ "learning_rate": 9.968403033130963e-06,
437
+ "loss": 0.7472,
438
+ "step": 590
439
+ },
440
+ {
441
+ "epoch": 0.38,
442
+ "grad_norm": 0.4956966042518616,
443
+ "learning_rate": 9.967285863635112e-06,
444
+ "loss": 0.7552,
445
+ "step": 600
446
+ },
447
+ {
448
+ "epoch": 0.38,
449
+ "eval_loss": 0.733000636100769,
450
+ "eval_runtime": 65.6052,
451
+ "eval_samples_per_second": 68.592,
452
+ "eval_steps_per_second": 4.298,
453
+ "step": 600
454
+ },
455
+ {
456
+ "epoch": 0.38,
457
+ "grad_norm": 0.528469979763031,
458
+ "learning_rate": 9.966149350747321e-06,
459
+ "loss": 0.7274,
460
+ "step": 610
461
+ },
462
+ {
463
+ "epoch": 0.39,
464
+ "grad_norm": 0.5717535614967346,
465
+ "learning_rate": 9.964993498893349e-06,
466
+ "loss": 0.7734,
467
+ "step": 620
468
+ },
469
+ {
470
+ "epoch": 0.4,
471
+ "grad_norm": 0.5049377083778381,
472
+ "learning_rate": 9.963818312574265e-06,
473
+ "loss": 0.7117,
474
+ "step": 630
475
+ },
476
+ {
477
+ "epoch": 0.4,
478
+ "grad_norm": 0.7002434134483337,
479
+ "learning_rate": 9.962623796366428e-06,
480
+ "loss": 0.7256,
481
+ "step": 640
482
+ },
483
+ {
484
+ "epoch": 0.41,
485
+ "grad_norm": 0.6600221991539001,
486
+ "learning_rate": 9.961409954921472e-06,
487
+ "loss": 0.764,
488
+ "step": 650
489
+ },
490
+ {
491
+ "epoch": 0.41,
492
+ "grad_norm": 0.5288920402526855,
493
+ "learning_rate": 9.96017679296629e-06,
494
+ "loss": 0.7385,
495
+ "step": 660
496
+ },
497
+ {
498
+ "epoch": 0.42,
499
+ "grad_norm": 0.6407844424247742,
500
+ "learning_rate": 9.958924315303005e-06,
501
+ "loss": 0.7386,
502
+ "step": 670
503
+ },
504
+ {
505
+ "epoch": 0.43,
506
+ "grad_norm": 0.6425316333770752,
507
+ "learning_rate": 9.95765252680896e-06,
508
+ "loss": 0.7013,
509
+ "step": 680
510
+ },
511
+ {
512
+ "epoch": 0.43,
513
+ "grad_norm": 0.6219075918197632,
514
+ "learning_rate": 9.956361432436705e-06,
515
+ "loss": 0.7104,
516
+ "step": 690
517
+ },
518
+ {
519
+ "epoch": 0.44,
520
+ "grad_norm": 0.5872789621353149,
521
+ "learning_rate": 9.95505103721396e-06,
522
+ "loss": 0.6988,
523
+ "step": 700
524
+ },
525
+ {
526
+ "epoch": 0.45,
527
+ "grad_norm": 0.8937903642654419,
528
+ "learning_rate": 9.953721346243613e-06,
529
+ "loss": 0.7177,
530
+ "step": 710
531
+ },
532
+ {
533
+ "epoch": 0.45,
534
+ "grad_norm": 0.5471718311309814,
535
+ "learning_rate": 9.952372364703688e-06,
536
+ "loss": 0.6804,
537
+ "step": 720
538
+ },
539
+ {
540
+ "epoch": 0.46,
541
+ "grad_norm": 0.7264242172241211,
542
+ "learning_rate": 9.95100409784733e-06,
543
+ "loss": 0.7432,
544
+ "step": 730
545
+ },
546
+ {
547
+ "epoch": 0.46,
548
+ "grad_norm": 0.7826697826385498,
549
+ "learning_rate": 9.949616551002787e-06,
550
+ "loss": 0.7521,
551
+ "step": 740
552
+ },
553
+ {
554
+ "epoch": 0.47,
555
+ "grad_norm": 0.6297461986541748,
556
+ "learning_rate": 9.948209729573384e-06,
557
+ "loss": 0.7624,
558
+ "step": 750
559
+ },
560
+ {
561
+ "epoch": 0.48,
562
+ "grad_norm": 0.7424671053886414,
563
+ "learning_rate": 9.946783639037503e-06,
564
+ "loss": 0.7619,
565
+ "step": 760
566
+ },
567
+ {
568
+ "epoch": 0.48,
569
+ "grad_norm": 0.5803889632225037,
570
+ "learning_rate": 9.945338284948568e-06,
571
+ "loss": 0.7159,
572
+ "step": 770
573
+ },
574
+ {
575
+ "epoch": 0.49,
576
+ "grad_norm": 0.895115077495575,
577
+ "learning_rate": 9.943873672935014e-06,
578
+ "loss": 0.7621,
579
+ "step": 780
580
+ },
581
+ {
582
+ "epoch": 0.5,
583
+ "grad_norm": 0.5325012803077698,
584
+ "learning_rate": 9.94238980870027e-06,
585
+ "loss": 0.6923,
586
+ "step": 790
587
+ },
588
+ {
589
+ "epoch": 0.5,
590
+ "grad_norm": 0.7378474473953247,
591
+ "learning_rate": 9.940886698022733e-06,
592
+ "loss": 0.7265,
593
+ "step": 800
594
+ },
595
+ {
596
+ "epoch": 0.5,
597
+ "eval_loss": 0.7256230711936951,
598
+ "eval_runtime": 70.6462,
599
+ "eval_samples_per_second": 63.698,
600
+ "eval_steps_per_second": 3.992,
601
+ "step": 800
602
+ },
603
+ {
604
+ "epoch": 0.51,
605
+ "grad_norm": 0.6454309225082397,
606
+ "learning_rate": 9.93936434675576e-06,
607
+ "loss": 0.6976,
608
+ "step": 810
609
+ },
610
+ {
611
+ "epoch": 0.51,
612
+ "grad_norm": 0.7399590015411377,
613
+ "learning_rate": 9.93782276082762e-06,
614
+ "loss": 0.7028,
615
+ "step": 820
616
+ },
617
+ {
618
+ "epoch": 0.52,
619
+ "grad_norm": 0.6661127209663391,
620
+ "learning_rate": 9.936261946241492e-06,
621
+ "loss": 0.7253,
622
+ "step": 830
623
+ },
624
+ {
625
+ "epoch": 0.53,
626
+ "grad_norm": 0.5480040907859802,
627
+ "learning_rate": 9.934681909075434e-06,
628
+ "loss": 0.7096,
629
+ "step": 840
630
+ },
631
+ {
632
+ "epoch": 0.53,
633
+ "grad_norm": 0.6889688968658447,
634
+ "learning_rate": 9.93308265548236e-06,
635
+ "loss": 0.745,
636
+ "step": 850
637
+ },
638
+ {
639
+ "epoch": 0.54,
640
+ "grad_norm": 0.6629202961921692,
641
+ "learning_rate": 9.931464191690015e-06,
642
+ "loss": 0.7111,
643
+ "step": 860
644
+ },
645
+ {
646
+ "epoch": 0.55,
647
+ "grad_norm": 0.5166647434234619,
648
+ "learning_rate": 9.929826524000948e-06,
649
+ "loss": 0.7296,
650
+ "step": 870
651
+ },
652
+ {
653
+ "epoch": 0.55,
654
+ "grad_norm": 0.6730151772499084,
655
+ "learning_rate": 9.928169658792498e-06,
656
+ "loss": 0.7387,
657
+ "step": 880
658
+ },
659
+ {
660
+ "epoch": 0.56,
661
+ "grad_norm": 0.6847391724586487,
662
+ "learning_rate": 9.926493602516758e-06,
663
+ "loss": 0.7156,
664
+ "step": 890
665
+ },
666
+ {
667
+ "epoch": 0.56,
668
+ "grad_norm": 0.7915560007095337,
669
+ "learning_rate": 9.924798361700554e-06,
670
+ "loss": 0.7956,
671
+ "step": 900
672
+ },
673
+ {
674
+ "epoch": 0.57,
675
+ "grad_norm": 0.5927907824516296,
676
+ "learning_rate": 9.923083942945419e-06,
677
+ "loss": 0.7361,
678
+ "step": 910
679
+ },
680
+ {
681
+ "epoch": 0.58,
682
+ "grad_norm": 0.7477264404296875,
683
+ "learning_rate": 9.92135035292757e-06,
684
+ "loss": 0.7091,
685
+ "step": 920
686
+ },
687
+ {
688
+ "epoch": 0.58,
689
+ "grad_norm": 0.7492902278900146,
690
+ "learning_rate": 9.919597598397882e-06,
691
+ "loss": 0.6967,
692
+ "step": 930
693
+ },
694
+ {
695
+ "epoch": 0.59,
696
+ "grad_norm": 0.7357175350189209,
697
+ "learning_rate": 9.91782568618185e-06,
698
+ "loss": 0.7509,
699
+ "step": 940
700
+ },
701
+ {
702
+ "epoch": 0.6,
703
+ "grad_norm": 0.6629440188407898,
704
+ "learning_rate": 9.916034623179584e-06,
705
+ "loss": 0.6999,
706
+ "step": 950
707
+ },
708
+ {
709
+ "epoch": 0.6,
710
+ "grad_norm": 0.5954321026802063,
711
+ "learning_rate": 9.914224416365765e-06,
712
+ "loss": 0.7194,
713
+ "step": 960
714
+ },
715
+ {
716
+ "epoch": 0.61,
717
+ "grad_norm": 0.9139691591262817,
718
+ "learning_rate": 9.91239507278962e-06,
719
+ "loss": 0.705,
720
+ "step": 970
721
+ },
722
+ {
723
+ "epoch": 0.61,
724
+ "grad_norm": 0.9337642788887024,
725
+ "learning_rate": 9.910546599574903e-06,
726
+ "loss": 0.7314,
727
+ "step": 980
728
+ },
729
+ {
730
+ "epoch": 0.62,
731
+ "grad_norm": 0.6616548299789429,
732
+ "learning_rate": 9.908679003919856e-06,
733
+ "loss": 0.7549,
734
+ "step": 990
735
+ },
736
+ {
737
+ "epoch": 0.63,
738
+ "grad_norm": 0.6958469152450562,
739
+ "learning_rate": 9.906792293097194e-06,
740
+ "loss": 0.7524,
741
+ "step": 1000
742
+ },
743
+ {
744
+ "epoch": 0.63,
745
+ "eval_loss": 0.7200015187263489,
746
+ "eval_runtime": 68.2079,
747
+ "eval_samples_per_second": 65.975,
748
+ "eval_steps_per_second": 4.134,
749
+ "step": 1000
750
+ },
751
+ {
752
+ "epoch": 0.63,
753
+ "grad_norm": 0.5352278351783752,
754
+ "learning_rate": 9.904886474454063e-06,
755
+ "loss": 0.7218,
756
+ "step": 1010
757
+ },
758
+ {
759
+ "epoch": 0.64,
760
+ "grad_norm": 0.6772333979606628,
761
+ "learning_rate": 9.90296155541202e-06,
762
+ "loss": 0.7171,
763
+ "step": 1020
764
+ },
765
+ {
766
+ "epoch": 0.65,
767
+ "grad_norm": 0.7102545499801636,
768
+ "learning_rate": 9.901017543467005e-06,
769
+ "loss": 0.758,
770
+ "step": 1030
771
+ },
772
+ {
773
+ "epoch": 0.65,
774
+ "grad_norm": 0.5379916429519653,
775
+ "learning_rate": 9.899054446189305e-06,
776
+ "loss": 0.7121,
777
+ "step": 1040
778
+ },
779
+ {
780
+ "epoch": 0.66,
781
+ "grad_norm": 0.6267510056495667,
782
+ "learning_rate": 9.897072271223526e-06,
783
+ "loss": 0.7088,
784
+ "step": 1050
785
+ },
786
+ {
787
+ "epoch": 0.66,
788
+ "grad_norm": 0.9392660856246948,
789
+ "learning_rate": 9.895071026288574e-06,
790
+ "loss": 0.7804,
791
+ "step": 1060
792
+ },
793
+ {
794
+ "epoch": 0.67,
795
+ "grad_norm": 0.9270221590995789,
796
+ "learning_rate": 9.893050719177608e-06,
797
+ "loss": 0.6935,
798
+ "step": 1070
799
+ },
800
+ {
801
+ "epoch": 0.68,
802
+ "grad_norm": 0.722115159034729,
803
+ "learning_rate": 9.891011357758022e-06,
804
+ "loss": 0.6894,
805
+ "step": 1080
806
+ },
807
+ {
808
+ "epoch": 0.68,
809
+ "grad_norm": 0.7055147886276245,
810
+ "learning_rate": 9.888952949971411e-06,
811
+ "loss": 0.7244,
812
+ "step": 1090
813
+ },
814
+ {
815
+ "epoch": 0.69,
816
+ "grad_norm": 0.7774051427841187,
817
+ "learning_rate": 9.886875503833537e-06,
818
+ "loss": 0.8156,
819
+ "step": 1100
820
+ },
821
+ {
822
+ "epoch": 0.7,
823
+ "grad_norm": 0.7965037226676941,
824
+ "learning_rate": 9.884779027434304e-06,
825
+ "loss": 0.7478,
826
+ "step": 1110
827
+ },
828
+ {
829
+ "epoch": 0.7,
830
+ "grad_norm": 0.8204682469367981,
831
+ "learning_rate": 9.882663528937716e-06,
832
+ "loss": 0.7187,
833
+ "step": 1120
834
+ },
835
+ {
836
+ "epoch": 0.71,
837
+ "grad_norm": 0.63904869556427,
838
+ "learning_rate": 9.880529016581863e-06,
839
+ "loss": 0.7145,
840
+ "step": 1130
841
+ },
842
+ {
843
+ "epoch": 0.72,
844
+ "grad_norm": 0.6523028612136841,
845
+ "learning_rate": 9.878375498678869e-06,
846
+ "loss": 0.731,
847
+ "step": 1140
848
+ },
849
+ {
850
+ "epoch": 0.72,
851
+ "grad_norm": 0.7148768901824951,
852
+ "learning_rate": 9.876202983614868e-06,
853
+ "loss": 0.7323,
854
+ "step": 1150
855
+ },
856
+ {
857
+ "epoch": 0.73,
858
+ "grad_norm": 0.6108402013778687,
859
+ "learning_rate": 9.874011479849981e-06,
860
+ "loss": 0.6757,
861
+ "step": 1160
862
+ },
863
+ {
864
+ "epoch": 0.73,
865
+ "grad_norm": 0.6056957244873047,
866
+ "learning_rate": 9.871800995918264e-06,
867
+ "loss": 0.7258,
868
+ "step": 1170
869
+ },
870
+ {
871
+ "epoch": 0.74,
872
+ "grad_norm": 0.7671077847480774,
873
+ "learning_rate": 9.86957154042769e-06,
874
+ "loss": 0.7334,
875
+ "step": 1180
876
+ },
877
+ {
878
+ "epoch": 0.75,
879
+ "grad_norm": 0.8327913880348206,
880
+ "learning_rate": 9.867323122060108e-06,
881
+ "loss": 0.7358,
882
+ "step": 1190
883
+ },
884
+ {
885
+ "epoch": 0.75,
886
+ "grad_norm": 0.7025701999664307,
887
+ "learning_rate": 9.865055749571215e-06,
888
+ "loss": 0.6976,
889
+ "step": 1200
890
+ },
891
+ {
892
+ "epoch": 0.75,
893
+ "eval_loss": 0.7151169180870056,
894
+ "eval_runtime": 64.9708,
895
+ "eval_samples_per_second": 69.262,
896
+ "eval_steps_per_second": 4.34,
897
+ "step": 1200
898
+ },
899
+ {
900
+ "epoch": 0.76,
901
+ "grad_norm": 0.8391425609588623,
902
+ "learning_rate": 9.862769431790513e-06,
903
+ "loss": 0.6983,
904
+ "step": 1210
905
+ },
906
+ {
907
+ "epoch": 0.77,
908
+ "grad_norm": 0.7243052124977112,
909
+ "learning_rate": 9.860464177621286e-06,
910
+ "loss": 0.7171,
911
+ "step": 1220
912
+ },
913
+ {
914
+ "epoch": 0.77,
915
+ "grad_norm": 0.6501705050468445,
916
+ "learning_rate": 9.858139996040554e-06,
917
+ "loss": 0.7206,
918
+ "step": 1230
919
+ },
920
+ {
921
+ "epoch": 0.78,
922
+ "grad_norm": 0.8618900179862976,
923
+ "learning_rate": 9.855796896099044e-06,
924
+ "loss": 0.7368,
925
+ "step": 1240
926
+ },
927
+ {
928
+ "epoch": 0.78,
929
+ "grad_norm": 0.826347291469574,
930
+ "learning_rate": 9.85343488692116e-06,
931
+ "loss": 0.7372,
932
+ "step": 1250
933
+ },
934
+ {
935
+ "epoch": 0.79,
936
+ "grad_norm": 0.8590556383132935,
937
+ "learning_rate": 9.851053977704931e-06,
938
+ "loss": 0.7373,
939
+ "step": 1260
940
+ },
941
+ {
942
+ "epoch": 0.8,
943
+ "grad_norm": 0.8719233274459839,
944
+ "learning_rate": 9.848654177721999e-06,
945
+ "loss": 0.7608,
946
+ "step": 1270
947
+ },
948
+ {
949
+ "epoch": 0.8,
950
+ "grad_norm": 0.7729814052581787,
951
+ "learning_rate": 9.846235496317556e-06,
952
+ "loss": 0.7227,
953
+ "step": 1280
954
+ },
955
+ {
956
+ "epoch": 0.81,
957
+ "grad_norm": 0.801908016204834,
958
+ "learning_rate": 9.843797942910328e-06,
959
+ "loss": 0.7415,
960
+ "step": 1290
961
+ },
962
+ {
963
+ "epoch": 0.82,
964
+ "grad_norm": 0.9884589910507202,
965
+ "learning_rate": 9.841341526992536e-06,
966
+ "loss": 0.7206,
967
+ "step": 1300
968
+ },
969
+ {
970
+ "epoch": 0.82,
971
+ "grad_norm": 0.7067356705665588,
972
+ "learning_rate": 9.838866258129847e-06,
973
+ "loss": 0.6704,
974
+ "step": 1310
975
+ },
976
+ {
977
+ "epoch": 0.83,
978
+ "grad_norm": 0.7258339524269104,
979
+ "learning_rate": 9.836372145961346e-06,
980
+ "loss": 0.7159,
981
+ "step": 1320
982
+ },
983
+ {
984
+ "epoch": 0.83,
985
+ "grad_norm": 0.8512592315673828,
986
+ "learning_rate": 9.833859200199498e-06,
987
+ "loss": 0.6916,
988
+ "step": 1330
989
+ },
990
+ {
991
+ "epoch": 0.84,
992
+ "grad_norm": 0.7856159210205078,
993
+ "learning_rate": 9.83132743063011e-06,
994
+ "loss": 0.7568,
995
+ "step": 1340
996
+ },
997
+ {
998
+ "epoch": 0.85,
999
+ "grad_norm": 0.7149519324302673,
1000
+ "learning_rate": 9.82877684711229e-06,
1001
+ "loss": 0.7017,
1002
+ "step": 1350
1003
+ },
1004
+ {
1005
+ "epoch": 0.85,
1006
+ "grad_norm": 1.0214589834213257,
1007
+ "learning_rate": 9.826207459578412e-06,
1008
+ "loss": 0.7127,
1009
+ "step": 1360
1010
+ },
1011
+ {
1012
+ "epoch": 0.86,
1013
+ "grad_norm": 1.0295792818069458,
1014
+ "learning_rate": 9.823619278034073e-06,
1015
+ "loss": 0.7013,
1016
+ "step": 1370
1017
+ },
1018
+ {
1019
+ "epoch": 0.87,
1020
+ "grad_norm": 0.8674212694168091,
1021
+ "learning_rate": 9.821012312558059e-06,
1022
+ "loss": 0.6942,
1023
+ "step": 1380
1024
+ },
1025
+ {
1026
+ "epoch": 0.87,
1027
+ "grad_norm": 0.7604880332946777,
1028
+ "learning_rate": 9.818386573302305e-06,
1029
+ "loss": 0.7013,
1030
+ "step": 1390
1031
+ },
1032
+ {
1033
+ "epoch": 0.88,
1034
+ "grad_norm": 0.7863268852233887,
1035
+ "learning_rate": 9.815742070491852e-06,
1036
+ "loss": 0.7408,
1037
+ "step": 1400
1038
+ },
1039
+ {
1040
+ "epoch": 0.88,
1041
+ "eval_loss": 0.7116020917892456,
1042
+ "eval_runtime": 64.7496,
1043
+ "eval_samples_per_second": 69.498,
1044
+ "eval_steps_per_second": 4.355,
1045
+ "step": 1400
1046
+ },
1047
+ {
1048
+ "epoch": 0.88,
1049
+ "grad_norm": 0.7451047301292419,
1050
+ "learning_rate": 9.81307881442481e-06,
1051
+ "loss": 0.7105,
1052
+ "step": 1410
1053
+ },
1054
+ {
1055
+ "epoch": 0.89,
1056
+ "grad_norm": 0.8191768527030945,
1057
+ "learning_rate": 9.810396815472316e-06,
1058
+ "loss": 0.6994,
1059
+ "step": 1420
1060
+ },
1061
+ {
1062
+ "epoch": 0.9,
1063
+ "grad_norm": 0.5049307942390442,
1064
+ "learning_rate": 9.807696084078494e-06,
1065
+ "loss": 0.7459,
1066
+ "step": 1430
1067
+ },
1068
+ {
1069
+ "epoch": 0.9,
1070
+ "grad_norm": 0.762649416923523,
1071
+ "learning_rate": 9.804976630760419e-06,
1072
+ "loss": 0.7048,
1073
+ "step": 1440
1074
+ },
1075
+ {
1076
+ "epoch": 0.91,
1077
+ "grad_norm": 0.8065420985221863,
1078
+ "learning_rate": 9.802238466108068e-06,
1079
+ "loss": 0.6975,
1080
+ "step": 1450
1081
+ },
1082
+ {
1083
+ "epoch": 0.92,
1084
+ "grad_norm": 0.899728000164032,
1085
+ "learning_rate": 9.799481600784286e-06,
1086
+ "loss": 0.737,
1087
+ "step": 1460
1088
+ },
1089
+ {
1090
+ "epoch": 0.92,
1091
+ "grad_norm": 0.7029632925987244,
1092
+ "learning_rate": 9.796706045524738e-06,
1093
+ "loss": 0.7236,
1094
+ "step": 1470
1095
+ },
1096
+ {
1097
+ "epoch": 0.93,
1098
+ "grad_norm": 0.7470441460609436,
1099
+ "learning_rate": 9.793911811137874e-06,
1100
+ "loss": 0.6984,
1101
+ "step": 1480
1102
+ },
1103
+ {
1104
+ "epoch": 0.93,
1105
+ "grad_norm": 0.8542289137840271,
1106
+ "learning_rate": 9.791098908504884e-06,
1107
+ "loss": 0.8019,
1108
+ "step": 1490
1109
+ },
1110
+ {
1111
+ "epoch": 0.94,
1112
+ "grad_norm": 0.749045193195343,
1113
+ "learning_rate": 9.788267348579649e-06,
1114
+ "loss": 0.7114,
1115
+ "step": 1500
1116
+ },
1117
+ {
1118
+ "epoch": 0.95,
1119
+ "grad_norm": 0.7834633588790894,
1120
+ "learning_rate": 9.78541714238871e-06,
1121
+ "loss": 0.7222,
1122
+ "step": 1510
1123
+ },
1124
+ {
1125
+ "epoch": 0.95,
1126
+ "grad_norm": 0.8488750457763672,
1127
+ "learning_rate": 9.782548301031218e-06,
1128
+ "loss": 0.7434,
1129
+ "step": 1520
1130
+ },
1131
+ {
1132
+ "epoch": 0.96,
1133
+ "grad_norm": 0.7018651962280273,
1134
+ "learning_rate": 9.77966083567889e-06,
1135
+ "loss": 0.7193,
1136
+ "step": 1530
1137
+ },
1138
+ {
1139
+ "epoch": 0.97,
1140
+ "grad_norm": 0.8260754346847534,
1141
+ "learning_rate": 9.776754757575975e-06,
1142
+ "loss": 0.7763,
1143
+ "step": 1540
1144
+ },
1145
+ {
1146
+ "epoch": 0.97,
1147
+ "grad_norm": 0.8732118010520935,
1148
+ "learning_rate": 9.773830078039193e-06,
1149
+ "loss": 0.7494,
1150
+ "step": 1550
1151
+ },
1152
+ {
1153
+ "epoch": 0.98,
1154
+ "grad_norm": 0.9026480317115784,
1155
+ "learning_rate": 9.77088680845771e-06,
1156
+ "loss": 0.7078,
1157
+ "step": 1560
1158
+ },
1159
+ {
1160
+ "epoch": 0.98,
1161
+ "grad_norm": 0.7559505105018616,
1162
+ "learning_rate": 9.767924960293076e-06,
1163
+ "loss": 0.7468,
1164
+ "step": 1570
1165
+ },
1166
+ {
1167
+ "epoch": 0.99,
1168
+ "grad_norm": 0.8832489848136902,
1169
+ "learning_rate": 9.764944545079197e-06,
1170
+ "loss": 0.7502,
1171
+ "step": 1580
1172
+ },
1173
+ {
1174
+ "epoch": 1.0,
1175
+ "grad_norm": 0.8065813183784485,
1176
+ "learning_rate": 9.761945574422276e-06,
1177
+ "loss": 0.7337,
1178
+ "step": 1590
1179
+ },
1180
+ {
1181
+ "epoch": 1.0,
1182
+ "grad_norm": 0.6966451406478882,
1183
+ "learning_rate": 9.758928060000779e-06,
1184
+ "loss": 0.701,
1185
+ "step": 1600
1186
+ },
1187
+ {
1188
+ "epoch": 1.0,
1189
+ "eval_loss": 0.7084596157073975,
1190
+ "eval_runtime": 64.9175,
1191
+ "eval_samples_per_second": 69.319,
1192
+ "eval_steps_per_second": 4.344,
1193
+ "step": 1600
1194
+ },
1195
+ {
1196
+ "epoch": 1.01,
1197
+ "grad_norm": 0.8769924640655518,
1198
+ "learning_rate": 9.755892013565377e-06,
1199
+ "loss": 0.7014,
1200
+ "step": 1610
1201
+ },
1202
+ {
1203
+ "epoch": 1.02,
1204
+ "grad_norm": 0.8940397500991821,
1205
+ "learning_rate": 9.752837446938915e-06,
1206
+ "loss": 0.7256,
1207
+ "step": 1620
1208
+ },
1209
+ {
1210
+ "epoch": 1.02,
1211
+ "grad_norm": 0.7818279266357422,
1212
+ "learning_rate": 9.749764372016355e-06,
1213
+ "loss": 0.7268,
1214
+ "step": 1630
1215
+ },
1216
+ {
1217
+ "epoch": 1.03,
1218
+ "grad_norm": 0.7369450330734253,
1219
+ "learning_rate": 9.746672800764734e-06,
1220
+ "loss": 0.6968,
1221
+ "step": 1640
1222
+ },
1223
+ {
1224
+ "epoch": 1.04,
1225
+ "grad_norm": 0.8924703001976013,
1226
+ "learning_rate": 9.743562745223118e-06,
1227
+ "loss": 0.7087,
1228
+ "step": 1650
1229
+ },
1230
+ {
1231
+ "epoch": 1.04,
1232
+ "grad_norm": 1.0398907661437988,
1233
+ "learning_rate": 9.740434217502549e-06,
1234
+ "loss": 0.7199,
1235
+ "step": 1660
1236
+ },
1237
+ {
1238
+ "epoch": 1.05,
1239
+ "grad_norm": 0.7427188754081726,
1240
+ "learning_rate": 9.737287229786007e-06,
1241
+ "loss": 0.687,
1242
+ "step": 1670
1243
+ },
1244
+ {
1245
+ "epoch": 1.05,
1246
+ "grad_norm": 0.9230946898460388,
1247
+ "learning_rate": 9.734121794328358e-06,
1248
+ "loss": 0.7003,
1249
+ "step": 1680
1250
+ },
1251
+ {
1252
+ "epoch": 1.06,
1253
+ "grad_norm": 0.8461260795593262,
1254
+ "learning_rate": 9.730937923456303e-06,
1255
+ "loss": 0.7329,
1256
+ "step": 1690
1257
+ },
1258
+ {
1259
+ "epoch": 1.07,
1260
+ "grad_norm": 0.783156156539917,
1261
+ "learning_rate": 9.727735629568335e-06,
1262
+ "loss": 0.6924,
1263
+ "step": 1700
1264
+ },
1265
+ {
1266
+ "epoch": 1.07,
1267
+ "grad_norm": 0.8659111261367798,
1268
+ "learning_rate": 9.724514925134696e-06,
1269
+ "loss": 0.7219,
1270
+ "step": 1710
1271
+ },
1272
+ {
1273
+ "epoch": 1.08,
1274
+ "grad_norm": 0.8218225240707397,
1275
+ "learning_rate": 9.721275822697307e-06,
1276
+ "loss": 0.6741,
1277
+ "step": 1720
1278
+ },
1279
+ {
1280
+ "epoch": 1.09,
1281
+ "grad_norm": 0.8807560205459595,
1282
+ "learning_rate": 9.718018334869748e-06,
1283
+ "loss": 0.7047,
1284
+ "step": 1730
1285
+ },
1286
+ {
1287
+ "epoch": 1.09,
1288
+ "grad_norm": 0.9925751090049744,
1289
+ "learning_rate": 9.714742474337187e-06,
1290
+ "loss": 0.7156,
1291
+ "step": 1740
1292
+ },
1293
+ {
1294
+ "epoch": 1.1,
1295
+ "grad_norm": 0.7675251364707947,
1296
+ "learning_rate": 9.711448253856336e-06,
1297
+ "loss": 0.6887,
1298
+ "step": 1750
1299
+ },
1300
+ {
1301
+ "epoch": 1.1,
1302
+ "grad_norm": 0.783015251159668,
1303
+ "learning_rate": 9.708135686255415e-06,
1304
+ "loss": 0.7373,
1305
+ "step": 1760
1306
+ },
1307
+ {
1308
+ "epoch": 1.11,
1309
+ "grad_norm": 0.8704028129577637,
1310
+ "learning_rate": 9.704804784434077e-06,
1311
+ "loss": 0.6652,
1312
+ "step": 1770
1313
+ },
1314
+ {
1315
+ "epoch": 1.12,
1316
+ "grad_norm": 0.9532449245452881,
1317
+ "learning_rate": 9.701455561363378e-06,
1318
+ "loss": 0.682,
1319
+ "step": 1780
1320
+ },
1321
+ {
1322
+ "epoch": 1.12,
1323
+ "grad_norm": 0.9703534245491028,
1324
+ "learning_rate": 9.698088030085721e-06,
1325
+ "loss": 0.6844,
1326
+ "step": 1790
1327
+ },
1328
+ {
1329
+ "epoch": 1.13,
1330
+ "grad_norm": 1.031153678894043,
1331
+ "learning_rate": 9.694702203714801e-06,
1332
+ "loss": 0.7084,
1333
+ "step": 1800
1334
+ },
1335
+ {
1336
+ "epoch": 1.13,
1337
+ "eval_loss": 0.705936074256897,
1338
+ "eval_runtime": 64.9167,
1339
+ "eval_samples_per_second": 69.32,
1340
+ "eval_steps_per_second": 4.344,
1341
+ "step": 1800
1342
+ },
1343
+ {
1344
+ "epoch": 1.14,
1345
+ "grad_norm": 0.8839524388313293,
1346
+ "learning_rate": 9.691298095435559e-06,
1347
+ "loss": 0.6897,
1348
+ "step": 1810
1349
+ },
1350
+ {
1351
+ "epoch": 1.14,
1352
+ "grad_norm": 1.0173550844192505,
1353
+ "learning_rate": 9.687875718504126e-06,
1354
+ "loss": 0.6851,
1355
+ "step": 1820
1356
+ },
1357
+ {
1358
+ "epoch": 1.15,
1359
+ "grad_norm": 1.0902131795883179,
1360
+ "learning_rate": 9.684435086247777e-06,
1361
+ "loss": 0.7132,
1362
+ "step": 1830
1363
+ },
1364
+ {
1365
+ "epoch": 1.15,
1366
+ "grad_norm": 0.8699798583984375,
1367
+ "learning_rate": 9.680976212064875e-06,
1368
+ "loss": 0.7129,
1369
+ "step": 1840
1370
+ },
1371
+ {
1372
+ "epoch": 1.16,
1373
+ "grad_norm": 0.879970133304596,
1374
+ "learning_rate": 9.677499109424818e-06,
1375
+ "loss": 0.6907,
1376
+ "step": 1850
1377
+ },
1378
+ {
1379
+ "epoch": 1.17,
1380
+ "grad_norm": 0.9659926295280457,
1381
+ "learning_rate": 9.674003791867993e-06,
1382
+ "loss": 0.7327,
1383
+ "step": 1860
1384
+ },
1385
+ {
1386
+ "epoch": 1.17,
1387
+ "grad_norm": 1.0900288820266724,
1388
+ "learning_rate": 9.670490273005713e-06,
1389
+ "loss": 0.7304,
1390
+ "step": 1870
1391
+ },
1392
+ {
1393
+ "epoch": 1.18,
1394
+ "grad_norm": 0.995785117149353,
1395
+ "learning_rate": 9.666958566520175e-06,
1396
+ "loss": 0.7076,
1397
+ "step": 1880
1398
+ },
1399
+ {
1400
+ "epoch": 1.19,
1401
+ "grad_norm": 1.0170907974243164,
1402
+ "learning_rate": 9.663408686164399e-06,
1403
+ "loss": 0.691,
1404
+ "step": 1890
1405
+ },
1406
+ {
1407
+ "epoch": 1.19,
1408
+ "grad_norm": 1.1418849229812622,
1409
+ "learning_rate": 9.659840645762176e-06,
1410
+ "loss": 0.74,
1411
+ "step": 1900
1412
+ },
1413
+ {
1414
+ "epoch": 1.2,
1415
+ "grad_norm": 0.7200061082839966,
1416
+ "learning_rate": 9.656254459208015e-06,
1417
+ "loss": 0.7295,
1418
+ "step": 1910
1419
+ },
1420
+ {
1421
+ "epoch": 1.2,
1422
+ "grad_norm": 0.9135183095932007,
1423
+ "learning_rate": 9.652650140467094e-06,
1424
+ "loss": 0.651,
1425
+ "step": 1920
1426
+ },
1427
+ {
1428
+ "epoch": 1.21,
1429
+ "grad_norm": 0.9724289774894714,
1430
+ "learning_rate": 9.649027703575193e-06,
1431
+ "loss": 0.7028,
1432
+ "step": 1930
1433
+ },
1434
+ {
1435
+ "epoch": 1.22,
1436
+ "grad_norm": 0.8180338740348816,
1437
+ "learning_rate": 9.645387162638652e-06,
1438
+ "loss": 0.7179,
1439
+ "step": 1940
1440
+ },
1441
+ {
1442
+ "epoch": 1.22,
1443
+ "grad_norm": 1.089158296585083,
1444
+ "learning_rate": 9.641728531834313e-06,
1445
+ "loss": 0.6872,
1446
+ "step": 1950
1447
+ },
1448
+ {
1449
+ "epoch": 1.23,
1450
+ "grad_norm": 1.0048317909240723,
1451
+ "learning_rate": 9.638051825409454e-06,
1452
+ "loss": 0.6991,
1453
+ "step": 1960
1454
+ },
1455
+ {
1456
+ "epoch": 1.24,
1457
+ "grad_norm": 1.1580454111099243,
1458
+ "learning_rate": 9.634357057681749e-06,
1459
+ "loss": 0.7183,
1460
+ "step": 1970
1461
+ },
1462
+ {
1463
+ "epoch": 1.24,
1464
+ "grad_norm": 1.0045746564865112,
1465
+ "learning_rate": 9.630644243039207e-06,
1466
+ "loss": 0.6795,
1467
+ "step": 1980
1468
+ },
1469
+ {
1470
+ "epoch": 1.25,
1471
+ "grad_norm": 0.9629393815994263,
1472
+ "learning_rate": 9.62691339594011e-06,
1473
+ "loss": 0.7075,
1474
+ "step": 1990
1475
+ },
1476
+ {
1477
+ "epoch": 1.25,
1478
+ "grad_norm": 0.946081280708313,
1479
+ "learning_rate": 9.623164530912963e-06,
1480
+ "loss": 0.6999,
1481
+ "step": 2000
1482
+ },
1483
+ {
1484
+ "epoch": 1.25,
1485
+ "eval_loss": 0.7040402293205261,
1486
+ "eval_runtime": 64.9289,
1487
+ "eval_samples_per_second": 69.307,
1488
+ "eval_steps_per_second": 4.343,
1489
+ "step": 2000
1490
+ },
1491
+ {
1492
+ "epoch": 1.26,
1493
+ "grad_norm": 1.0208806991577148,
1494
+ "learning_rate": 9.619397662556434e-06,
1495
+ "loss": 0.6947,
1496
+ "step": 2010
1497
+ },
1498
+ {
1499
+ "epoch": 1.27,
1500
+ "grad_norm": 1.3248392343521118,
1501
+ "learning_rate": 9.615612805539305e-06,
1502
+ "loss": 0.7102,
1503
+ "step": 2020
1504
+ },
1505
+ {
1506
+ "epoch": 1.27,
1507
+ "grad_norm": 0.9521629810333252,
1508
+ "learning_rate": 9.6118099746004e-06,
1509
+ "loss": 0.7068,
1510
+ "step": 2030
1511
+ },
1512
+ {
1513
+ "epoch": 1.28,
1514
+ "grad_norm": 1.129441738128662,
1515
+ "learning_rate": 9.607989184548544e-06,
1516
+ "loss": 0.6528,
1517
+ "step": 2040
1518
+ },
1519
+ {
1520
+ "epoch": 1.29,
1521
+ "grad_norm": 1.2303441762924194,
1522
+ "learning_rate": 9.604150450262488e-06,
1523
+ "loss": 0.6838,
1524
+ "step": 2050
1525
+ },
1526
+ {
1527
+ "epoch": 1.29,
1528
+ "grad_norm": 1.433111310005188,
1529
+ "learning_rate": 9.600293786690873e-06,
1530
+ "loss": 0.6908,
1531
+ "step": 2060
1532
+ },
1533
+ {
1534
+ "epoch": 1.3,
1535
+ "grad_norm": 1.11778724193573,
1536
+ "learning_rate": 9.596419208852152e-06,
1537
+ "loss": 0.7153,
1538
+ "step": 2070
1539
+ },
1540
+ {
1541
+ "epoch": 1.3,
1542
+ "grad_norm": 1.3464716672897339,
1543
+ "learning_rate": 9.592526731834536e-06,
1544
+ "loss": 0.67,
1545
+ "step": 2080
1546
+ },
1547
+ {
1548
+ "epoch": 1.31,
1549
+ "grad_norm": 1.0811423063278198,
1550
+ "learning_rate": 9.588616370795947e-06,
1551
+ "loss": 0.705,
1552
+ "step": 2090
1553
+ },
1554
+ {
1555
+ "epoch": 1.32,
1556
+ "grad_norm": 1.2497215270996094,
1557
+ "learning_rate": 9.584688140963945e-06,
1558
+ "loss": 0.7037,
1559
+ "step": 2100
1560
+ },
1561
+ {
1562
+ "epoch": 1.32,
1563
+ "grad_norm": 1.0369244813919067,
1564
+ "learning_rate": 9.580742057635672e-06,
1565
+ "loss": 0.7199,
1566
+ "step": 2110
1567
+ },
1568
+ {
1569
+ "epoch": 1.33,
1570
+ "grad_norm": 0.8846107125282288,
1571
+ "learning_rate": 9.576778136177798e-06,
1572
+ "loss": 0.7098,
1573
+ "step": 2120
1574
+ },
1575
+ {
1576
+ "epoch": 1.34,
1577
+ "grad_norm": 1.2424838542938232,
1578
+ "learning_rate": 9.572796392026455e-06,
1579
+ "loss": 0.7109,
1580
+ "step": 2130
1581
+ },
1582
+ {
1583
+ "epoch": 1.34,
1584
+ "grad_norm": 1.1349953413009644,
1585
+ "learning_rate": 9.568796840687184e-06,
1586
+ "loss": 0.693,
1587
+ "step": 2140
1588
+ },
1589
+ {
1590
+ "epoch": 1.35,
1591
+ "grad_norm": 0.9105272889137268,
1592
+ "learning_rate": 9.564779497734864e-06,
1593
+ "loss": 0.6679,
1594
+ "step": 2150
1595
+ },
1596
+ {
1597
+ "epoch": 1.36,
1598
+ "grad_norm": 1.021628737449646,
1599
+ "learning_rate": 9.56074437881366e-06,
1600
+ "loss": 0.6573,
1601
+ "step": 2160
1602
+ },
1603
+ {
1604
+ "epoch": 1.36,
1605
+ "grad_norm": 1.1030464172363281,
1606
+ "learning_rate": 9.55669149963696e-06,
1607
+ "loss": 0.705,
1608
+ "step": 2170
1609
+ },
1610
+ {
1611
+ "epoch": 1.37,
1612
+ "grad_norm": 1.1582733392715454,
1613
+ "learning_rate": 9.552620875987312e-06,
1614
+ "loss": 0.6932,
1615
+ "step": 2180
1616
+ },
1617
+ {
1618
+ "epoch": 1.37,
1619
+ "grad_norm": 1.2710620164871216,
1620
+ "learning_rate": 9.548532523716366e-06,
1621
+ "loss": 0.6616,
1622
+ "step": 2190
1623
+ },
1624
+ {
1625
+ "epoch": 1.38,
1626
+ "grad_norm": 1.1528280973434448,
1627
+ "learning_rate": 9.544426458744805e-06,
1628
+ "loss": 0.7182,
1629
+ "step": 2200
1630
+ },
1631
+ {
1632
+ "epoch": 1.38,
1633
+ "eval_loss": 0.7022137641906738,
1634
+ "eval_runtime": 64.965,
1635
+ "eval_samples_per_second": 69.268,
1636
+ "eval_steps_per_second": 4.341,
1637
+ "step": 2200
1638
+ },
1639
+ {
1640
+ "epoch": 1.39,
1641
+ "grad_norm": 1.1455330848693848,
1642
+ "learning_rate": 9.540302697062294e-06,
1643
+ "loss": 0.6878,
1644
+ "step": 2210
1645
+ },
1646
+ {
1647
+ "epoch": 1.39,
1648
+ "grad_norm": 1.4521374702453613,
1649
+ "learning_rate": 9.536161254727407e-06,
1650
+ "loss": 0.6979,
1651
+ "step": 2220
1652
+ },
1653
+ {
1654
+ "epoch": 1.4,
1655
+ "grad_norm": 1.4062340259552002,
1656
+ "learning_rate": 9.532002147867575e-06,
1657
+ "loss": 0.6749,
1658
+ "step": 2230
1659
+ },
1660
+ {
1661
+ "epoch": 1.41,
1662
+ "grad_norm": 1.0267623662948608,
1663
+ "learning_rate": 9.527825392679012e-06,
1664
+ "loss": 0.6987,
1665
+ "step": 2240
1666
+ },
1667
+ {
1668
+ "epoch": 1.41,
1669
+ "grad_norm": 1.0981144905090332,
1670
+ "learning_rate": 9.523631005426658e-06,
1671
+ "loss": 0.6888,
1672
+ "step": 2250
1673
+ },
1674
+ {
1675
+ "epoch": 1.42,
1676
+ "grad_norm": 1.0353021621704102,
1677
+ "learning_rate": 9.51941900244412e-06,
1678
+ "loss": 0.6471,
1679
+ "step": 2260
1680
+ },
1681
+ {
1682
+ "epoch": 1.42,
1683
+ "grad_norm": 1.1088558435440063,
1684
+ "learning_rate": 9.515189400133594e-06,
1685
+ "loss": 0.6689,
1686
+ "step": 2270
1687
+ },
1688
+ {
1689
+ "epoch": 1.43,
1690
+ "grad_norm": 1.1822565793991089,
1691
+ "learning_rate": 9.510942214965819e-06,
1692
+ "loss": 0.7001,
1693
+ "step": 2280
1694
+ },
1695
+ {
1696
+ "epoch": 1.44,
1697
+ "grad_norm": 1.2247307300567627,
1698
+ "learning_rate": 9.506677463480003e-06,
1699
+ "loss": 0.6999,
1700
+ "step": 2290
1701
+ },
1702
+ {
1703
+ "epoch": 1.44,
1704
+ "grad_norm": 1.163528323173523,
1705
+ "learning_rate": 9.50239516228376e-06,
1706
+ "loss": 0.7008,
1707
+ "step": 2300
1708
+ },
1709
+ {
1710
+ "epoch": 1.45,
1711
+ "grad_norm": 1.2677900791168213,
1712
+ "learning_rate": 9.49809532805304e-06,
1713
+ "loss": 0.7122,
1714
+ "step": 2310
1715
+ },
1716
+ {
1717
+ "epoch": 1.46,
1718
+ "grad_norm": 1.1475526094436646,
1719
+ "learning_rate": 9.493777977532072e-06,
1720
+ "loss": 0.7106,
1721
+ "step": 2320
1722
+ },
1723
+ {
1724
+ "epoch": 1.46,
1725
+ "grad_norm": 1.1459851264953613,
1726
+ "learning_rate": 9.489443127533304e-06,
1727
+ "loss": 0.6739,
1728
+ "step": 2330
1729
+ },
1730
+ {
1731
+ "epoch": 1.47,
1732
+ "grad_norm": 1.2973495721817017,
1733
+ "learning_rate": 9.485090794937319e-06,
1734
+ "loss": 0.6888,
1735
+ "step": 2340
1736
+ },
1737
+ {
1738
+ "epoch": 1.47,
1739
+ "grad_norm": 1.0322624444961548,
1740
+ "learning_rate": 9.480720996692783e-06,
1741
+ "loss": 0.6986,
1742
+ "step": 2350
1743
+ },
1744
+ {
1745
+ "epoch": 1.48,
1746
+ "grad_norm": 1.407605767250061,
1747
+ "learning_rate": 9.476333749816382e-06,
1748
+ "loss": 0.7314,
1749
+ "step": 2360
1750
+ },
1751
+ {
1752
+ "epoch": 1.49,
1753
+ "grad_norm": 1.1082048416137695,
1754
+ "learning_rate": 9.47192907139274e-06,
1755
+ "loss": 0.6602,
1756
+ "step": 2370
1757
+ },
1758
+ {
1759
+ "epoch": 1.49,
1760
+ "grad_norm": 1.02568519115448,
1761
+ "learning_rate": 9.46750697857437e-06,
1762
+ "loss": 0.6454,
1763
+ "step": 2380
1764
+ },
1765
+ {
1766
+ "epoch": 1.5,
1767
+ "grad_norm": 1.12267005443573,
1768
+ "learning_rate": 9.463067488581598e-06,
1769
+ "loss": 0.6499,
1770
+ "step": 2390
1771
+ },
1772
+ {
1773
+ "epoch": 1.51,
1774
+ "grad_norm": 1.0023943185806274,
1775
+ "learning_rate": 9.45861061870249e-06,
1776
+ "loss": 0.7267,
1777
+ "step": 2400
1778
+ },
1779
+ {
1780
+ "epoch": 1.51,
1781
+ "eval_loss": 0.6993948817253113,
1782
+ "eval_runtime": 64.9272,
1783
+ "eval_samples_per_second": 69.308,
1784
+ "eval_steps_per_second": 4.343,
1785
+ "step": 2400
1786
+ },
1787
+ {
1788
+ "epoch": 1.51,
1789
+ "grad_norm": 1.2597460746765137,
1790
+ "learning_rate": 9.454136386292804e-06,
1791
+ "loss": 0.6934,
1792
+ "step": 2410
1793
+ },
1794
+ {
1795
+ "epoch": 1.52,
1796
+ "grad_norm": 1.293137788772583,
1797
+ "learning_rate": 9.449644808775902e-06,
1798
+ "loss": 0.7095,
1799
+ "step": 2420
1800
+ },
1801
+ {
1802
+ "epoch": 1.52,
1803
+ "grad_norm": 1.0400352478027344,
1804
+ "learning_rate": 9.445135903642693e-06,
1805
+ "loss": 0.6626,
1806
+ "step": 2430
1807
+ },
1808
+ {
1809
+ "epoch": 1.53,
1810
+ "grad_norm": 1.0873581171035767,
1811
+ "learning_rate": 9.440609688451561e-06,
1812
+ "loss": 0.6513,
1813
+ "step": 2440
1814
+ },
1815
+ {
1816
+ "epoch": 1.54,
1817
+ "grad_norm": 1.0420424938201904,
1818
+ "learning_rate": 9.4360661808283e-06,
1819
+ "loss": 0.711,
1820
+ "step": 2450
1821
+ },
1822
+ {
1823
+ "epoch": 1.54,
1824
+ "grad_norm": 1.3502894639968872,
1825
+ "learning_rate": 9.431505398466045e-06,
1826
+ "loss": 0.6991,
1827
+ "step": 2460
1828
+ },
1829
+ {
1830
+ "epoch": 1.55,
1831
+ "grad_norm": 1.3502726554870605,
1832
+ "learning_rate": 9.426927359125195e-06,
1833
+ "loss": 0.7073,
1834
+ "step": 2470
1835
+ },
1836
+ {
1837
+ "epoch": 1.56,
1838
+ "grad_norm": 1.2768748998641968,
1839
+ "learning_rate": 9.422332080633361e-06,
1840
+ "loss": 0.6557,
1841
+ "step": 2480
1842
+ },
1843
+ {
1844
+ "epoch": 1.56,
1845
+ "grad_norm": 1.1925798654556274,
1846
+ "learning_rate": 9.417719580885275e-06,
1847
+ "loss": 0.6786,
1848
+ "step": 2490
1849
+ },
1850
+ {
1851
+ "epoch": 1.57,
1852
+ "grad_norm": 0.9290177822113037,
1853
+ "learning_rate": 9.413089877842735e-06,
1854
+ "loss": 0.6159,
1855
+ "step": 2500
1856
+ },
1857
+ {
1858
+ "epoch": 1.57,
1859
+ "grad_norm": 1.3553310632705688,
1860
+ "learning_rate": 9.408442989534536e-06,
1861
+ "loss": 0.7341,
1862
+ "step": 2510
1863
+ },
1864
+ {
1865
+ "epoch": 1.58,
1866
+ "grad_norm": 0.9777106642723083,
1867
+ "learning_rate": 9.403778934056392e-06,
1868
+ "loss": 0.6737,
1869
+ "step": 2520
1870
+ },
1871
+ {
1872
+ "epoch": 1.59,
1873
+ "grad_norm": 1.47153902053833,
1874
+ "learning_rate": 9.399097729570865e-06,
1875
+ "loss": 0.6832,
1876
+ "step": 2530
1877
+ },
1878
+ {
1879
+ "epoch": 1.59,
1880
+ "grad_norm": 1.2370259761810303,
1881
+ "learning_rate": 9.394399394307303e-06,
1882
+ "loss": 0.6691,
1883
+ "step": 2540
1884
+ },
1885
+ {
1886
+ "epoch": 1.6,
1887
+ "grad_norm": 1.2009457349777222,
1888
+ "learning_rate": 9.38968394656176e-06,
1889
+ "loss": 0.7072,
1890
+ "step": 2550
1891
+ },
1892
+ {
1893
+ "epoch": 1.61,
1894
+ "grad_norm": 1.095410704612732,
1895
+ "learning_rate": 9.384951404696933e-06,
1896
+ "loss": 0.7068,
1897
+ "step": 2560
1898
+ },
1899
+ {
1900
+ "epoch": 1.61,
1901
+ "grad_norm": 1.0805617570877075,
1902
+ "learning_rate": 9.380201787142085e-06,
1903
+ "loss": 0.6476,
1904
+ "step": 2570
1905
+ },
1906
+ {
1907
+ "epoch": 1.62,
1908
+ "grad_norm": 1.3433113098144531,
1909
+ "learning_rate": 9.37543511239297e-06,
1910
+ "loss": 0.6805,
1911
+ "step": 2580
1912
+ },
1913
+ {
1914
+ "epoch": 1.62,
1915
+ "grad_norm": 1.3151830434799194,
1916
+ "learning_rate": 9.370651399011769e-06,
1917
+ "loss": 0.6887,
1918
+ "step": 2590
1919
+ },
1920
+ {
1921
+ "epoch": 1.63,
1922
+ "grad_norm": 1.3367606401443481,
1923
+ "learning_rate": 9.365850665627016e-06,
1924
+ "loss": 0.6912,
1925
+ "step": 2600
1926
+ },
1927
+ {
1928
+ "epoch": 1.63,
1929
+ "eval_loss": 0.6971801519393921,
1930
+ "eval_runtime": 65.0021,
1931
+ "eval_samples_per_second": 69.229,
1932
+ "eval_steps_per_second": 4.338,
1933
+ "step": 2600
1934
+ },
1935
+ {
1936
+ "epoch": 1.64,
1937
+ "grad_norm": 1.3351305723190308,
1938
+ "learning_rate": 9.36103293093352e-06,
1939
+ "loss": 0.6479,
1940
+ "step": 2610
1941
+ },
1942
+ {
1943
+ "epoch": 1.64,
1944
+ "grad_norm": 1.3986787796020508,
1945
+ "learning_rate": 9.356198213692297e-06,
1946
+ "loss": 0.6788,
1947
+ "step": 2620
1948
+ },
1949
+ {
1950
+ "epoch": 1.65,
1951
+ "grad_norm": 1.0550477504730225,
1952
+ "learning_rate": 9.351346532730499e-06,
1953
+ "loss": 0.6481,
1954
+ "step": 2630
1955
+ },
1956
+ {
1957
+ "epoch": 1.66,
1958
+ "grad_norm": 1.275985836982727,
1959
+ "learning_rate": 9.346477906941331e-06,
1960
+ "loss": 0.6893,
1961
+ "step": 2640
1962
+ },
1963
+ {
1964
+ "epoch": 1.66,
1965
+ "grad_norm": 1.441588044166565,
1966
+ "learning_rate": 9.341592355283986e-06,
1967
+ "loss": 0.6784,
1968
+ "step": 2650
1969
+ },
1970
+ {
1971
+ "epoch": 1.67,
1972
+ "grad_norm": 1.2504793405532837,
1973
+ "learning_rate": 9.336689896783575e-06,
1974
+ "loss": 0.6834,
1975
+ "step": 2660
1976
+ },
1977
+ {
1978
+ "epoch": 1.68,
1979
+ "grad_norm": 1.2592806816101074,
1980
+ "learning_rate": 9.331770550531037e-06,
1981
+ "loss": 0.6701,
1982
+ "step": 2670
1983
+ },
1984
+ {
1985
+ "epoch": 1.68,
1986
+ "grad_norm": 1.494611382484436,
1987
+ "learning_rate": 9.32683433568308e-06,
1988
+ "loss": 0.6691,
1989
+ "step": 2680
1990
+ },
1991
+ {
1992
+ "epoch": 1.69,
1993
+ "grad_norm": 1.2938275337219238,
1994
+ "learning_rate": 9.321881271462104e-06,
1995
+ "loss": 0.6818,
1996
+ "step": 2690
1997
+ },
1998
+ {
1999
+ "epoch": 1.69,
2000
+ "grad_norm": 1.5548397302627563,
2001
+ "learning_rate": 9.316911377156116e-06,
2002
+ "loss": 0.6852,
2003
+ "step": 2700
2004
+ },
2005
+ {
2006
+ "epoch": 1.7,
2007
+ "grad_norm": 1.2488983869552612,
2008
+ "learning_rate": 9.31192467211867e-06,
2009
+ "loss": 0.6653,
2010
+ "step": 2710
2011
+ },
2012
+ {
2013
+ "epoch": 1.71,
2014
+ "grad_norm": 1.3493934869766235,
2015
+ "learning_rate": 9.306921175768776e-06,
2016
+ "loss": 0.6671,
2017
+ "step": 2720
2018
+ },
2019
+ {
2020
+ "epoch": 1.71,
2021
+ "grad_norm": 1.388487696647644,
2022
+ "learning_rate": 9.301900907590836e-06,
2023
+ "loss": 0.7066,
2024
+ "step": 2730
2025
+ },
2026
+ {
2027
+ "epoch": 1.72,
2028
+ "grad_norm": 1.2521592378616333,
2029
+ "learning_rate": 9.296863887134561e-06,
2030
+ "loss": 0.7326,
2031
+ "step": 2740
2032
+ },
2033
+ {
2034
+ "epoch": 1.73,
2035
+ "grad_norm": 1.0157365798950195,
2036
+ "learning_rate": 9.291810134014904e-06,
2037
+ "loss": 0.6758,
2038
+ "step": 2750
2039
+ },
2040
+ {
2041
+ "epoch": 1.73,
2042
+ "grad_norm": 1.0712261199951172,
2043
+ "learning_rate": 9.286739667911973e-06,
2044
+ "loss": 0.6645,
2045
+ "step": 2760
2046
+ },
2047
+ {
2048
+ "epoch": 1.74,
2049
+ "grad_norm": 1.5114517211914062,
2050
+ "learning_rate": 9.281652508570957e-06,
2051
+ "loss": 0.6968,
2052
+ "step": 2770
2053
+ },
2054
+ {
2055
+ "epoch": 1.74,
2056
+ "grad_norm": 1.3408139944076538,
2057
+ "learning_rate": 9.27654867580206e-06,
2058
+ "loss": 0.6718,
2059
+ "step": 2780
2060
+ },
2061
+ {
2062
+ "epoch": 1.75,
2063
+ "grad_norm": 1.3024680614471436,
2064
+ "learning_rate": 9.271428189480405e-06,
2065
+ "loss": 0.6915,
2066
+ "step": 2790
2067
+ },
2068
+ {
2069
+ "epoch": 1.76,
2070
+ "grad_norm": 1.3444178104400635,
2071
+ "learning_rate": 9.266291069545972e-06,
2072
+ "loss": 0.6821,
2073
+ "step": 2800
2074
+ },
2075
+ {
2076
+ "epoch": 1.76,
2077
+ "eval_loss": 0.6953641176223755,
2078
+ "eval_runtime": 65.0162,
2079
+ "eval_samples_per_second": 69.213,
2080
+ "eval_steps_per_second": 4.337,
2081
+ "step": 2800
2082
+ },
2083
+ {
2084
+ "epoch": 1.76,
2085
+ "grad_norm": 1.5429843664169312,
2086
+ "learning_rate": 9.261137336003511e-06,
2087
+ "loss": 0.666,
2088
+ "step": 2810
2089
+ },
2090
+ {
2091
+ "epoch": 1.77,
2092
+ "grad_norm": 1.143649697303772,
2093
+ "learning_rate": 9.255967008922475e-06,
2094
+ "loss": 0.6414,
2095
+ "step": 2820
2096
+ },
2097
+ {
2098
+ "epoch": 1.78,
2099
+ "grad_norm": 1.2989628314971924,
2100
+ "learning_rate": 9.250780108436926e-06,
2101
+ "loss": 0.7321,
2102
+ "step": 2830
2103
+ },
2104
+ {
2105
+ "epoch": 1.78,
2106
+ "grad_norm": 1.4191828966140747,
2107
+ "learning_rate": 9.245576654745471e-06,
2108
+ "loss": 0.735,
2109
+ "step": 2840
2110
+ },
2111
+ {
2112
+ "epoch": 1.79,
2113
+ "grad_norm": 1.5203850269317627,
2114
+ "learning_rate": 9.24035666811118e-06,
2115
+ "loss": 0.6809,
2116
+ "step": 2850
2117
+ },
2118
+ {
2119
+ "epoch": 1.79,
2120
+ "grad_norm": 1.3680098056793213,
2121
+ "learning_rate": 9.235120168861495e-06,
2122
+ "loss": 0.6378,
2123
+ "step": 2860
2124
+ },
2125
+ {
2126
+ "epoch": 1.8,
2127
+ "grad_norm": 1.346763014793396,
2128
+ "learning_rate": 9.229867177388172e-06,
2129
+ "loss": 0.6648,
2130
+ "step": 2870
2131
+ },
2132
+ {
2133
+ "epoch": 1.81,
2134
+ "grad_norm": 1.0098768472671509,
2135
+ "learning_rate": 9.224597714147186e-06,
2136
+ "loss": 0.6681,
2137
+ "step": 2880
2138
+ },
2139
+ {
2140
+ "epoch": 1.81,
2141
+ "grad_norm": 1.3174008131027222,
2142
+ "learning_rate": 9.219311799658652e-06,
2143
+ "loss": 0.6752,
2144
+ "step": 2890
2145
+ },
2146
+ {
2147
+ "epoch": 1.82,
2148
+ "grad_norm": 1.692084789276123,
2149
+ "learning_rate": 9.214009454506754e-06,
2150
+ "loss": 0.6427,
2151
+ "step": 2900
2152
+ },
2153
+ {
2154
+ "epoch": 1.83,
2155
+ "grad_norm": 1.3471505641937256,
2156
+ "learning_rate": 9.208690699339656e-06,
2157
+ "loss": 0.6763,
2158
+ "step": 2910
2159
+ },
2160
+ {
2161
+ "epoch": 1.83,
2162
+ "grad_norm": 1.202491283416748,
2163
+ "learning_rate": 9.203355554869428e-06,
2164
+ "loss": 0.6935,
2165
+ "step": 2920
2166
+ },
2167
+ {
2168
+ "epoch": 1.84,
2169
+ "grad_norm": 1.7211599349975586,
2170
+ "learning_rate": 9.198004041871962e-06,
2171
+ "loss": 0.7012,
2172
+ "step": 2930
2173
+ },
2174
+ {
2175
+ "epoch": 1.84,
2176
+ "grad_norm": 1.336504578590393,
2177
+ "learning_rate": 9.192636181186887e-06,
2178
+ "loss": 0.6713,
2179
+ "step": 2940
2180
+ },
2181
+ {
2182
+ "epoch": 1.85,
2183
+ "grad_norm": 1.2259244918823242,
2184
+ "learning_rate": 9.1872519937175e-06,
2185
+ "loss": 0.6344,
2186
+ "step": 2950
2187
+ },
2188
+ {
2189
+ "epoch": 1.86,
2190
+ "grad_norm": 1.3948123455047607,
2191
+ "learning_rate": 9.181851500430672e-06,
2192
+ "loss": 0.6699,
2193
+ "step": 2960
2194
+ },
2195
+ {
2196
+ "epoch": 1.86,
2197
+ "grad_norm": 1.2859784364700317,
2198
+ "learning_rate": 9.176434722356772e-06,
2199
+ "loss": 0.7029,
2200
+ "step": 2970
2201
+ },
2202
+ {
2203
+ "epoch": 1.87,
2204
+ "grad_norm": 1.1549146175384521,
2205
+ "learning_rate": 9.17100168058959e-06,
2206
+ "loss": 0.6491,
2207
+ "step": 2980
2208
+ },
2209
+ {
2210
+ "epoch": 1.88,
2211
+ "grad_norm": 1.575208306312561,
2212
+ "learning_rate": 9.165552396286236e-06,
2213
+ "loss": 0.6722,
2214
+ "step": 2990
2215
+ },
2216
+ {
2217
+ "epoch": 1.88,
2218
+ "grad_norm": 1.6159918308258057,
2219
+ "learning_rate": 9.160086890667086e-06,
2220
+ "loss": 0.7104,
2221
+ "step": 3000
2222
+ },
2223
+ {
2224
+ "epoch": 1.88,
2225
+ "eval_loss": 0.6943792104721069,
2226
+ "eval_runtime": 65.034,
2227
+ "eval_samples_per_second": 69.195,
2228
+ "eval_steps_per_second": 4.336,
2229
+ "step": 3000
2230
+ },
2231
+ {
2232
+ "epoch": 1.89,
2233
+ "grad_norm": 1.247308611869812,
2234
+ "learning_rate": 9.154605185015678e-06,
2235
+ "loss": 0.7042,
2236
+ "step": 3010
2237
+ },
2238
+ {
2239
+ "epoch": 1.89,
2240
+ "grad_norm": 1.540644884109497,
2241
+ "learning_rate": 9.14910730067863e-06,
2242
+ "loss": 0.6208,
2243
+ "step": 3020
2244
+ },
2245
+ {
2246
+ "epoch": 1.9,
2247
+ "grad_norm": 1.4479825496673584,
2248
+ "learning_rate": 9.143593259065573e-06,
2249
+ "loss": 0.6721,
2250
+ "step": 3030
2251
+ },
2252
+ {
2253
+ "epoch": 1.91,
2254
+ "grad_norm": 1.5486655235290527,
2255
+ "learning_rate": 9.138063081649052e-06,
2256
+ "loss": 0.6328,
2257
+ "step": 3040
2258
+ },
2259
+ {
2260
+ "epoch": 1.91,
2261
+ "grad_norm": 1.0703155994415283,
2262
+ "learning_rate": 9.132516789964443e-06,
2263
+ "loss": 0.6564,
2264
+ "step": 3050
2265
+ },
2266
+ {
2267
+ "epoch": 1.92,
2268
+ "grad_norm": 1.2725510597229004,
2269
+ "learning_rate": 9.126954405609882e-06,
2270
+ "loss": 0.6782,
2271
+ "step": 3060
2272
+ },
2273
+ {
2274
+ "epoch": 1.93,
2275
+ "grad_norm": 1.328399419784546,
2276
+ "learning_rate": 9.121375950246165e-06,
2277
+ "loss": 0.6686,
2278
+ "step": 3070
2279
+ },
2280
+ {
2281
+ "epoch": 1.93,
2282
+ "grad_norm": 1.2014747858047485,
2283
+ "learning_rate": 9.115781445596676e-06,
2284
+ "loss": 0.6445,
2285
+ "step": 3080
2286
+ },
2287
+ {
2288
+ "epoch": 1.94,
2289
+ "grad_norm": 1.3578124046325684,
2290
+ "learning_rate": 9.110170913447294e-06,
2291
+ "loss": 0.6306,
2292
+ "step": 3090
2293
+ },
2294
+ {
2295
+ "epoch": 1.94,
2296
+ "grad_norm": 1.3624286651611328,
2297
+ "learning_rate": 9.104544375646314e-06,
2298
+ "loss": 0.6465,
2299
+ "step": 3100
2300
+ },
2301
+ {
2302
+ "epoch": 1.95,
2303
+ "grad_norm": 1.709974765777588,
2304
+ "learning_rate": 9.098901854104359e-06,
2305
+ "loss": 0.6985,
2306
+ "step": 3110
2307
+ },
2308
+ {
2309
+ "epoch": 1.96,
2310
+ "grad_norm": 1.3302754163742065,
2311
+ "learning_rate": 9.09324337079429e-06,
2312
+ "loss": 0.7272,
2313
+ "step": 3120
2314
+ },
2315
+ {
2316
+ "epoch": 1.96,
2317
+ "grad_norm": 1.2946594953536987,
2318
+ "learning_rate": 9.08756894775114e-06,
2319
+ "loss": 0.6632,
2320
+ "step": 3130
2321
+ },
2322
+ {
2323
+ "epoch": 1.97,
2324
+ "grad_norm": 1.2699226140975952,
2325
+ "learning_rate": 9.081878607071996e-06,
2326
+ "loss": 0.6996,
2327
+ "step": 3140
2328
+ },
2329
+ {
2330
+ "epoch": 1.98,
2331
+ "grad_norm": 1.4561275243759155,
2332
+ "learning_rate": 9.076172370915944e-06,
2333
+ "loss": 0.734,
2334
+ "step": 3150
2335
+ },
2336
+ {
2337
+ "epoch": 1.98,
2338
+ "grad_norm": 1.4393534660339355,
2339
+ "learning_rate": 9.07045026150396e-06,
2340
+ "loss": 0.6578,
2341
+ "step": 3160
2342
+ },
2343
+ {
2344
+ "epoch": 1.99,
2345
+ "grad_norm": 1.4745630025863647,
2346
+ "learning_rate": 9.064712301118842e-06,
2347
+ "loss": 0.6527,
2348
+ "step": 3170
2349
+ },
2350
+ {
2351
+ "epoch": 1.99,
2352
+ "grad_norm": 1.1444178819656372,
2353
+ "learning_rate": 9.058958512105104e-06,
2354
+ "loss": 0.6487,
2355
+ "step": 3180
2356
+ },
2357
+ {
2358
+ "epoch": 2.0,
2359
+ "grad_norm": 1.433406114578247,
2360
+ "learning_rate": 9.053188916868912e-06,
2361
+ "loss": 0.7011,
2362
+ "step": 3190
2363
+ },
2364
+ {
2365
+ "epoch": 2.01,
2366
+ "grad_norm": 1.218345046043396,
2367
+ "learning_rate": 9.04740353787797e-06,
2368
+ "loss": 0.6222,
2369
+ "step": 3200
2370
+ },
2371
+ {
2372
+ "epoch": 2.01,
2373
+ "eval_loss": 0.693417489528656,
2374
+ "eval_runtime": 65.021,
2375
+ "eval_samples_per_second": 69.208,
2376
+ "eval_steps_per_second": 4.337,
2377
+ "step": 3200
2378
+ },
2379
+ {
2380
+ "epoch": 2.01,
2381
+ "grad_norm": 1.5473078489303589,
2382
+ "learning_rate": 9.041602397661459e-06,
2383
+ "loss": 0.6396,
2384
+ "step": 3210
2385
+ },
2386
+ {
2387
+ "epoch": 2.02,
2388
+ "grad_norm": 1.3116644620895386,
2389
+ "learning_rate": 9.035785518809928e-06,
2390
+ "loss": 0.6582,
2391
+ "step": 3220
2392
+ },
2393
+ {
2394
+ "epoch": 2.03,
2395
+ "grad_norm": 1.7744321823120117,
2396
+ "learning_rate": 9.029952923975217e-06,
2397
+ "loss": 0.6517,
2398
+ "step": 3230
2399
+ },
2400
+ {
2401
+ "epoch": 2.03,
2402
+ "grad_norm": 1.5516449213027954,
2403
+ "learning_rate": 9.024104635870368e-06,
2404
+ "loss": 0.6465,
2405
+ "step": 3240
2406
+ },
2407
+ {
2408
+ "epoch": 2.04,
2409
+ "grad_norm": 1.4612600803375244,
2410
+ "learning_rate": 9.018240677269532e-06,
2411
+ "loss": 0.6215,
2412
+ "step": 3250
2413
+ },
2414
+ {
2415
+ "epoch": 2.05,
2416
+ "grad_norm": 1.7983644008636475,
2417
+ "learning_rate": 9.012361071007892e-06,
2418
+ "loss": 0.6609,
2419
+ "step": 3260
2420
+ },
2421
+ {
2422
+ "epoch": 2.05,
2423
+ "grad_norm": 1.6382901668548584,
2424
+ "learning_rate": 9.00646583998155e-06,
2425
+ "loss": 0.6608,
2426
+ "step": 3270
2427
+ },
2428
+ {
2429
+ "epoch": 2.06,
2430
+ "grad_norm": 1.6763097047805786,
2431
+ "learning_rate": 9.000555007147469e-06,
2432
+ "loss": 0.6222,
2433
+ "step": 3280
2434
+ },
2435
+ {
2436
+ "epoch": 2.06,
2437
+ "grad_norm": 1.3221015930175781,
2438
+ "learning_rate": 8.994628595523358e-06,
2439
+ "loss": 0.6363,
2440
+ "step": 3290
2441
+ },
2442
+ {
2443
+ "epoch": 2.07,
2444
+ "grad_norm": 1.5837445259094238,
2445
+ "learning_rate": 8.988686628187597e-06,
2446
+ "loss": 0.6364,
2447
+ "step": 3300
2448
+ },
2449
+ {
2450
+ "epoch": 2.08,
2451
+ "grad_norm": 1.4271923303604126,
2452
+ "learning_rate": 8.98272912827914e-06,
2453
+ "loss": 0.6211,
2454
+ "step": 3310
2455
+ },
2456
+ {
2457
+ "epoch": 2.08,
2458
+ "grad_norm": 1.599827527999878,
2459
+ "learning_rate": 8.97675611899743e-06,
2460
+ "loss": 0.6326,
2461
+ "step": 3320
2462
+ },
2463
+ {
2464
+ "epoch": 2.09,
2465
+ "grad_norm": 1.6661384105682373,
2466
+ "learning_rate": 8.970767623602299e-06,
2467
+ "loss": 0.7006,
2468
+ "step": 3330
2469
+ },
2470
+ {
2471
+ "epoch": 2.1,
2472
+ "grad_norm": 1.9200857877731323,
2473
+ "learning_rate": 8.964763665413894e-06,
2474
+ "loss": 0.6316,
2475
+ "step": 3340
2476
+ },
2477
+ {
2478
+ "epoch": 2.1,
2479
+ "grad_norm": 1.8148436546325684,
2480
+ "learning_rate": 8.95874426781257e-06,
2481
+ "loss": 0.6318,
2482
+ "step": 3350
2483
+ },
2484
+ {
2485
+ "epoch": 2.11,
2486
+ "grad_norm": 1.3358807563781738,
2487
+ "learning_rate": 8.952709454238809e-06,
2488
+ "loss": 0.6067,
2489
+ "step": 3360
2490
+ },
2491
+ {
2492
+ "epoch": 2.11,
2493
+ "grad_norm": 1.8055490255355835,
2494
+ "learning_rate": 8.946659248193122e-06,
2495
+ "loss": 0.6289,
2496
+ "step": 3370
2497
+ },
2498
+ {
2499
+ "epoch": 2.12,
2500
+ "grad_norm": 1.4589310884475708,
2501
+ "learning_rate": 8.940593673235962e-06,
2502
+ "loss": 0.6537,
2503
+ "step": 3380
2504
+ },
2505
+ {
2506
+ "epoch": 2.13,
2507
+ "grad_norm": 1.903086543083191,
2508
+ "learning_rate": 8.934512752987635e-06,
2509
+ "loss": 0.6986,
2510
+ "step": 3390
2511
+ },
2512
+ {
2513
+ "epoch": 2.13,
2514
+ "grad_norm": 1.722476840019226,
2515
+ "learning_rate": 8.928416511128194e-06,
2516
+ "loss": 0.6383,
2517
+ "step": 3400
2518
+ },
2519
+ {
2520
+ "epoch": 2.13,
2521
+ "eval_loss": 0.6974382996559143,
2522
+ "eval_runtime": 65.0527,
2523
+ "eval_samples_per_second": 69.175,
2524
+ "eval_steps_per_second": 4.335,
2525
+ "step": 3400
2526
+ },
2527
+ {
2528
+ "epoch": 2.14,
2529
+ "grad_norm": 1.8206970691680908,
2530
+ "learning_rate": 8.922304971397369e-06,
2531
+ "loss": 0.6447,
2532
+ "step": 3410
2533
+ },
2534
+ {
2535
+ "epoch": 2.15,
2536
+ "grad_norm": 1.690631628036499,
2537
+ "learning_rate": 8.916178157594453e-06,
2538
+ "loss": 0.6441,
2539
+ "step": 3420
2540
+ },
2541
+ {
2542
+ "epoch": 2.15,
2543
+ "grad_norm": 2.108876943588257,
2544
+ "learning_rate": 8.910036093578223e-06,
2545
+ "loss": 0.6453,
2546
+ "step": 3430
2547
+ },
2548
+ {
2549
+ "epoch": 2.16,
2550
+ "grad_norm": 1.6356040239334106,
2551
+ "learning_rate": 8.90387880326684e-06,
2552
+ "loss": 0.6256,
2553
+ "step": 3440
2554
+ },
2555
+ {
2556
+ "epoch": 2.16,
2557
+ "grad_norm": 1.3783752918243408,
2558
+ "learning_rate": 8.897706310637766e-06,
2559
+ "loss": 0.6244,
2560
+ "step": 3450
2561
+ },
2562
+ {
2563
+ "epoch": 2.17,
2564
+ "grad_norm": 1.901208758354187,
2565
+ "learning_rate": 8.89151863972765e-06,
2566
+ "loss": 0.5975,
2567
+ "step": 3460
2568
+ },
2569
+ {
2570
+ "epoch": 2.18,
2571
+ "grad_norm": 1.595009446144104,
2572
+ "learning_rate": 8.88531581463226e-06,
2573
+ "loss": 0.6777,
2574
+ "step": 3470
2575
+ },
2576
+ {
2577
+ "epoch": 2.18,
2578
+ "grad_norm": 1.6791367530822754,
2579
+ "learning_rate": 8.879097859506371e-06,
2580
+ "loss": 0.6139,
2581
+ "step": 3480
2582
+ },
2583
+ {
2584
+ "epoch": 2.19,
2585
+ "grad_norm": 1.5569841861724854,
2586
+ "learning_rate": 8.872864798563676e-06,
2587
+ "loss": 0.5775,
2588
+ "step": 3490
2589
+ },
2590
+ {
2591
+ "epoch": 2.2,
2592
+ "grad_norm": 1.3804527521133423,
2593
+ "learning_rate": 8.866616656076696e-06,
2594
+ "loss": 0.624,
2595
+ "step": 3500
2596
+ },
2597
+ {
2598
+ "epoch": 2.2,
2599
+ "grad_norm": 1.7905445098876953,
2600
+ "learning_rate": 8.860353456376679e-06,
2601
+ "loss": 0.6647,
2602
+ "step": 3510
2603
+ },
2604
+ {
2605
+ "epoch": 2.21,
2606
+ "grad_norm": 1.802614688873291,
2607
+ "learning_rate": 8.854075223853509e-06,
2608
+ "loss": 0.7081,
2609
+ "step": 3520
2610
+ },
2611
+ {
2612
+ "epoch": 2.21,
2613
+ "grad_norm": 1.7135951519012451,
2614
+ "learning_rate": 8.847781982955613e-06,
2615
+ "loss": 0.6974,
2616
+ "step": 3530
2617
+ },
2618
+ {
2619
+ "epoch": 2.22,
2620
+ "grad_norm": 1.8468406200408936,
2621
+ "learning_rate": 8.841473758189853e-06,
2622
+ "loss": 0.6585,
2623
+ "step": 3540
2624
+ },
2625
+ {
2626
+ "epoch": 2.23,
2627
+ "grad_norm": 1.7520302534103394,
2628
+ "learning_rate": 8.835150574121455e-06,
2629
+ "loss": 0.6116,
2630
+ "step": 3550
2631
+ },
2632
+ {
2633
+ "epoch": 2.23,
2634
+ "grad_norm": 1.862479329109192,
2635
+ "learning_rate": 8.828812455373891e-06,
2636
+ "loss": 0.6333,
2637
+ "step": 3560
2638
+ },
2639
+ {
2640
+ "epoch": 2.24,
2641
+ "grad_norm": 1.7767084836959839,
2642
+ "learning_rate": 8.82245942662879e-06,
2643
+ "loss": 0.6015,
2644
+ "step": 3570
2645
+ },
2646
+ {
2647
+ "epoch": 2.25,
2648
+ "grad_norm": 1.6162598133087158,
2649
+ "learning_rate": 8.816091512625845e-06,
2650
+ "loss": 0.6719,
2651
+ "step": 3580
2652
+ },
2653
+ {
2654
+ "epoch": 2.25,
2655
+ "grad_norm": 1.8923571109771729,
2656
+ "learning_rate": 8.80970873816271e-06,
2657
+ "loss": 0.6562,
2658
+ "step": 3590
2659
+ },
2660
+ {
2661
+ "epoch": 2.26,
2662
+ "grad_norm": 1.7792338132858276,
2663
+ "learning_rate": 8.803311128094918e-06,
2664
+ "loss": 0.6436,
2665
+ "step": 3600
2666
+ },
2667
+ {
2668
+ "epoch": 2.26,
2669
+ "eval_loss": 0.6980520486831665,
2670
+ "eval_runtime": 65.0239,
2671
+ "eval_samples_per_second": 69.205,
2672
+ "eval_steps_per_second": 4.337,
2673
+ "step": 3600
2674
+ },
2675
+ {
2676
+ "epoch": 2.26,
2677
+ "grad_norm": 1.819449543952942,
2678
+ "learning_rate": 8.796898707335766e-06,
2679
+ "loss": 0.6022,
2680
+ "step": 3610
2681
+ },
2682
+ {
2683
+ "epoch": 2.27,
2684
+ "grad_norm": 1.923462986946106,
2685
+ "learning_rate": 8.790471500856229e-06,
2686
+ "loss": 0.6124,
2687
+ "step": 3620
2688
+ },
2689
+ {
2690
+ "epoch": 2.28,
2691
+ "grad_norm": 2.0969247817993164,
2692
+ "learning_rate": 8.784029533684857e-06,
2693
+ "loss": 0.6209,
2694
+ "step": 3630
2695
+ },
2696
+ {
2697
+ "epoch": 2.28,
2698
+ "grad_norm": 2.014631509780884,
2699
+ "learning_rate": 8.777572830907685e-06,
2700
+ "loss": 0.6179,
2701
+ "step": 3640
2702
+ },
2703
+ {
2704
+ "epoch": 2.29,
2705
+ "grad_norm": 1.7940195798873901,
2706
+ "learning_rate": 8.771101417668127e-06,
2707
+ "loss": 0.6815,
2708
+ "step": 3650
2709
+ },
2710
+ {
2711
+ "epoch": 2.3,
2712
+ "grad_norm": 1.7244881391525269,
2713
+ "learning_rate": 8.764615319166885e-06,
2714
+ "loss": 0.5767,
2715
+ "step": 3660
2716
+ },
2717
+ {
2718
+ "epoch": 2.3,
2719
+ "grad_norm": 2.157749652862549,
2720
+ "learning_rate": 8.758114560661846e-06,
2721
+ "loss": 0.6281,
2722
+ "step": 3670
2723
+ },
2724
+ {
2725
+ "epoch": 2.31,
2726
+ "grad_norm": 1.818303108215332,
2727
+ "learning_rate": 8.751599167467985e-06,
2728
+ "loss": 0.6368,
2729
+ "step": 3680
2730
+ },
2731
+ {
2732
+ "epoch": 2.31,
2733
+ "grad_norm": 1.8076434135437012,
2734
+ "learning_rate": 8.745069164957265e-06,
2735
+ "loss": 0.6503,
2736
+ "step": 3690
2737
+ },
2738
+ {
2739
+ "epoch": 2.32,
2740
+ "grad_norm": 1.7755082845687866,
2741
+ "learning_rate": 8.738524578558547e-06,
2742
+ "loss": 0.6503,
2743
+ "step": 3700
2744
+ },
2745
+ {
2746
+ "epoch": 2.33,
2747
+ "grad_norm": 2.0641837120056152,
2748
+ "learning_rate": 8.731965433757474e-06,
2749
+ "loss": 0.6412,
2750
+ "step": 3710
2751
+ },
2752
+ {
2753
+ "epoch": 2.33,
2754
+ "grad_norm": 2.174612045288086,
2755
+ "learning_rate": 8.72539175609639e-06,
2756
+ "loss": 0.6283,
2757
+ "step": 3720
2758
+ },
2759
+ {
2760
+ "epoch": 2.34,
2761
+ "grad_norm": 1.905965805053711,
2762
+ "learning_rate": 8.718803571174229e-06,
2763
+ "loss": 0.6316,
2764
+ "step": 3730
2765
+ },
2766
+ {
2767
+ "epoch": 2.35,
2768
+ "grad_norm": 2.290787935256958,
2769
+ "learning_rate": 8.712200904646417e-06,
2770
+ "loss": 0.6337,
2771
+ "step": 3740
2772
+ },
2773
+ {
2774
+ "epoch": 2.35,
2775
+ "grad_norm": 1.7773081064224243,
2776
+ "learning_rate": 8.705583782224776e-06,
2777
+ "loss": 0.6683,
2778
+ "step": 3750
2779
+ },
2780
+ {
2781
+ "epoch": 2.36,
2782
+ "grad_norm": 1.7513020038604736,
2783
+ "learning_rate": 8.698952229677422e-06,
2784
+ "loss": 0.6538,
2785
+ "step": 3760
2786
+ },
2787
+ {
2788
+ "epoch": 2.37,
2789
+ "grad_norm": 1.8641185760498047,
2790
+ "learning_rate": 8.692306272828661e-06,
2791
+ "loss": 0.6179,
2792
+ "step": 3770
2793
+ },
2794
+ {
2795
+ "epoch": 2.37,
2796
+ "grad_norm": 2.4094667434692383,
2797
+ "learning_rate": 8.685645937558896e-06,
2798
+ "loss": 0.6436,
2799
+ "step": 3780
2800
+ },
2801
+ {
2802
+ "epoch": 2.38,
2803
+ "grad_norm": 2.295719623565674,
2804
+ "learning_rate": 8.678971249804517e-06,
2805
+ "loss": 0.6242,
2806
+ "step": 3790
2807
+ },
2808
+ {
2809
+ "epoch": 2.38,
2810
+ "grad_norm": 2.3604509830474854,
2811
+ "learning_rate": 8.67228223555781e-06,
2812
+ "loss": 0.6444,
2813
+ "step": 3800
2814
+ },
2815
+ {
2816
+ "epoch": 2.38,
2817
+ "eval_loss": 0.6968220472335815,
2818
+ "eval_runtime": 65.0257,
2819
+ "eval_samples_per_second": 69.203,
2820
+ "eval_steps_per_second": 4.337,
2821
+ "step": 3800
2822
+ },
2823
+ {
2824
+ "epoch": 2.39,
2825
+ "grad_norm": 1.799545407295227,
2826
+ "learning_rate": 8.665578920866844e-06,
2827
+ "loss": 0.6562,
2828
+ "step": 3810
2829
+ },
2830
+ {
2831
+ "epoch": 2.4,
2832
+ "grad_norm": 1.87678062915802,
2833
+ "learning_rate": 8.658861331835384e-06,
2834
+ "loss": 0.6776,
2835
+ "step": 3820
2836
+ },
2837
+ {
2838
+ "epoch": 2.4,
2839
+ "grad_norm": 1.9466888904571533,
2840
+ "learning_rate": 8.652129494622776e-06,
2841
+ "loss": 0.6245,
2842
+ "step": 3830
2843
+ },
2844
+ {
2845
+ "epoch": 2.41,
2846
+ "grad_norm": 1.9451625347137451,
2847
+ "learning_rate": 8.645383435443853e-06,
2848
+ "loss": 0.6692,
2849
+ "step": 3840
2850
+ },
2851
+ {
2852
+ "epoch": 2.42,
2853
+ "grad_norm": 1.9275856018066406,
2854
+ "learning_rate": 8.638623180568829e-06,
2855
+ "loss": 0.6314,
2856
+ "step": 3850
2857
+ },
2858
+ {
2859
+ "epoch": 2.42,
2860
+ "grad_norm": 1.7316443920135498,
2861
+ "learning_rate": 8.631848756323198e-06,
2862
+ "loss": 0.6289,
2863
+ "step": 3860
2864
+ },
2865
+ {
2866
+ "epoch": 2.43,
2867
+ "grad_norm": 2.5170657634735107,
2868
+ "learning_rate": 8.625060189087636e-06,
2869
+ "loss": 0.6367,
2870
+ "step": 3870
2871
+ },
2872
+ {
2873
+ "epoch": 2.43,
2874
+ "grad_norm": 2.2198870182037354,
2875
+ "learning_rate": 8.618257505297887e-06,
2876
+ "loss": 0.6262,
2877
+ "step": 3880
2878
+ },
2879
+ {
2880
+ "epoch": 2.44,
2881
+ "grad_norm": 1.9207948446273804,
2882
+ "learning_rate": 8.611440731444673e-06,
2883
+ "loss": 0.6598,
2884
+ "step": 3890
2885
+ },
2886
+ {
2887
+ "epoch": 2.45,
2888
+ "grad_norm": 2.0086634159088135,
2889
+ "learning_rate": 8.604609894073583e-06,
2890
+ "loss": 0.6465,
2891
+ "step": 3900
2892
+ },
2893
+ {
2894
+ "epoch": 2.45,
2895
+ "grad_norm": 2.0597639083862305,
2896
+ "learning_rate": 8.597765019784972e-06,
2897
+ "loss": 0.665,
2898
+ "step": 3910
2899
+ },
2900
+ {
2901
+ "epoch": 2.46,
2902
+ "grad_norm": 1.8585723638534546,
2903
+ "learning_rate": 8.590906135233854e-06,
2904
+ "loss": 0.6207,
2905
+ "step": 3920
2906
+ },
2907
+ {
2908
+ "epoch": 2.47,
2909
+ "grad_norm": 2.1862194538116455,
2910
+ "learning_rate": 8.584033267129807e-06,
2911
+ "loss": 0.6626,
2912
+ "step": 3930
2913
+ },
2914
+ {
2915
+ "epoch": 2.47,
2916
+ "grad_norm": 2.048553228378296,
2917
+ "learning_rate": 8.577146442236856e-06,
2918
+ "loss": 0.6141,
2919
+ "step": 3940
2920
+ },
2921
+ {
2922
+ "epoch": 2.48,
2923
+ "grad_norm": 2.2547719478607178,
2924
+ "learning_rate": 8.570245687373384e-06,
2925
+ "loss": 0.6651,
2926
+ "step": 3950
2927
+ },
2928
+ {
2929
+ "epoch": 2.48,
2930
+ "grad_norm": 1.9522244930267334,
2931
+ "learning_rate": 8.563331029412013e-06,
2932
+ "loss": 0.6725,
2933
+ "step": 3960
2934
+ },
2935
+ {
2936
+ "epoch": 2.49,
2937
+ "grad_norm": 1.7376751899719238,
2938
+ "learning_rate": 8.556402495279506e-06,
2939
+ "loss": 0.6066,
2940
+ "step": 3970
2941
+ },
2942
+ {
2943
+ "epoch": 2.5,
2944
+ "grad_norm": 1.900639295578003,
2945
+ "learning_rate": 8.549460111956665e-06,
2946
+ "loss": 0.6752,
2947
+ "step": 3980
2948
+ },
2949
+ {
2950
+ "epoch": 2.5,
2951
+ "grad_norm": 2.1750218868255615,
2952
+ "learning_rate": 8.542503906478224e-06,
2953
+ "loss": 0.6554,
2954
+ "step": 3990
2955
+ },
2956
+ {
2957
+ "epoch": 2.51,
2958
+ "grad_norm": 1.6551765203475952,
2959
+ "learning_rate": 8.535533905932739e-06,
2960
+ "loss": 0.6368,
2961
+ "step": 4000
2962
+ },
2963
+ {
2964
+ "epoch": 2.51,
2965
+ "eval_loss": 0.6986888647079468,
2966
+ "eval_runtime": 65.007,
2967
+ "eval_samples_per_second": 69.223,
2968
+ "eval_steps_per_second": 4.338,
2969
+ "step": 4000
2970
+ },
2971
+ {
2972
+ "epoch": 2.51,
2973
+ "step": 4000,
2974
+ "total_flos": 9.03392259225944e+17,
2975
+ "train_loss": 0.7081527805328369,
2976
+ "train_runtime": 4312.5386,
2977
+ "train_samples_per_second": 59.13,
2978
+ "train_steps_per_second": 3.696
2979
+ }
2980
+ ],
2981
+ "logging_steps": 10,
2982
+ "max_steps": 15940,
2983
+ "num_input_tokens_seen": 0,
2984
+ "num_train_epochs": 10,
2985
+ "save_steps": 1000,
2986
+ "total_flos": 9.03392259225944e+17,
2987
+ "train_batch_size": 8,
2988
+ "trial_name": null,
2989
+ "trial_params": null
2990
+ }
llama2_13b_peft/news_commentary_de/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:190c16756cb1d693a35940640a359e26ac9a5176a57e5fc3919e9b137d670ef2
3
+ size 5112
llama2_13b_peft/news_commentary_de/training_eval_loss.png ADDED
llama2_13b_peft/news_commentary_de/training_loss.png ADDED
llama2_13b_peft/news_commentary_it/README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: peft
4
+ tags:
5
+ - llama-factory
6
+ - lora
7
+ - generated_from_trainer
8
+ base_model: /data1/model/llama2/meta-llama/Llama2-13b
9
+ model-index:
10
+ - name: news_commentary_it_no_sys
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # news_commentary_it_no_sys
18
+
19
+ This model is a fine-tuned version of [/data1/model/llama2/meta-llama/Llama2-13b](https://huggingface.co//data1/model/llama2/meta-llama/Llama2-13b) on the news_commentary_it_no_sys dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.6415
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 5e-05
41
+ - train_batch_size: 4
42
+ - eval_batch_size: 4
43
+ - seed: 42
44
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
+ - lr_scheduler_type: cosine
46
+ - lr_scheduler_warmup_steps: 20
47
+ - num_epochs: 5.0
48
+
49
+ ### Training results
50
+
51
+ | Training Loss | Epoch | Step | Validation Loss |
52
+ |:-------------:|:------:|:-----:|:---------------:|
53
+ | 0.7236 | 0.2807 | 2000 | 0.6710 |
54
+ | 0.724 | 0.5614 | 4000 | 0.6521 |
55
+ | 0.6455 | 0.8421 | 6000 | 0.6415 |
56
+ | 0.5533 | 1.1228 | 8000 | 0.6548 |
57
+ | 0.5192 | 1.4035 | 10000 | 0.6501 |
58
+ | 0.4796 | 1.6842 | 12000 | 0.6500 |
59
+
60
+
61
+ ### Framework versions
62
+
63
+ - PEFT 0.10.0
64
+ - Transformers 4.40.0
65
+ - Pytorch 2.2.1
66
+ - Datasets 2.18.0
67
+ - Tokenizers 0.19.1
llama2_13b_peft/news_commentary_it/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/data1/model/llama2/meta-llama/Llama2-13b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "down_proj",
24
+ "up_proj",
25
+ "k_proj",
26
+ "gate_proj",
27
+ "q_proj",
28
+ "o_proj",
29
+ "v_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
llama2_13b_peft/news_commentary_it/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edef1dfcb7b9574ec3ab4820df092cc5ff382071e97048d16d7f69d7094885d3
3
+ size 125248064
llama2_13b_peft/news_commentary_it/all_results.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.6842105263157894,
3
+ "eval_loss": 0.6415141820907593,
4
+ "eval_runtime": 119.5773,
5
+ "eval_samples_per_second": 12.544,
6
+ "eval_steps_per_second": 3.136,
7
+ "total_flos": 6.933368738955264e+17,
8
+ "train_loss": 0.6038338423768679,
9
+ "train_runtime": 5861.7175,
10
+ "train_samples_per_second": 24.31,
11
+ "train_steps_per_second": 6.078
12
+ }
llama2_13b_peft/news_commentary_it/eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.6842105263157894,
3
+ "eval_loss": 0.6415141820907593,
4
+ "eval_runtime": 119.5773,
5
+ "eval_samples_per_second": 12.544,
6
+ "eval_steps_per_second": 3.136
7
+ }
llama2_13b_peft/news_commentary_it/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
llama2_13b_peft/news_commentary_it/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
llama2_13b_peft/news_commentary_it/tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "LlamaTokenizer",
43
+ "unk_token": "<unk>",
44
+ "use_default_system_prompt": false
45
+ }
llama2_13b_peft/news_commentary_it/train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.6842105263157894,
3
+ "total_flos": 6.933368738955264e+17,
4
+ "train_loss": 0.6038338423768679,
5
+ "train_runtime": 5861.7175,
6
+ "train_samples_per_second": 24.31,
7
+ "train_steps_per_second": 6.078
8
+ }
llama2_13b_peft/news_commentary_it/trainer_log.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llama2_13b_peft/news_commentary_it/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
llama2_13b_peft/news_commentary_it/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c760325c4e915ed6add2ca1fa0f2456628f1a65e9e53ca6ae7e92088e8ec81d2
3
+ size 5176
llama2_13b_peft/news_commentary_it/training_eval_loss.png ADDED
llama2_13b_peft/news_commentary_it/training_loss.png ADDED
llama2_13b_peft/topical_chat/README.md ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: peft
4
+ tags:
5
+ - llama-factory
6
+ - lora
7
+ - generated_from_trainer
8
+ base_model: /data1/model/llama2/meta-llama/Llama2-13b
9
+ model-index:
10
+ - name: topical_chat_no_sys
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # topical_chat_no_sys
18
+
19
+ This model is a fine-tuned version of [/data1/model/llama2/meta-llama/Llama2-13b](https://huggingface.co//data1/model/llama2/meta-llama/Llama2-13b) on the topical_chat_no_sys dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 1.8941
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 5e-05
41
+ - train_batch_size: 4
42
+ - eval_batch_size: 4
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - num_devices: 2
46
+ - total_train_batch_size: 8
47
+ - total_eval_batch_size: 8
48
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
+ - lr_scheduler_type: cosine
50
+ - lr_scheduler_warmup_steps: 20
51
+ - num_epochs: 5.0
52
+
53
+ ### Training results
54
+
55
+ | Training Loss | Epoch | Step | Validation Loss |
56
+ |:-------------:|:------:|:----:|:---------------:|
57
+ | 2.1904 | 0.0472 | 100 | 2.1137 |
58
+ | 1.9627 | 0.0944 | 200 | 2.0589 |
59
+ | 2.0172 | 0.1416 | 300 | 2.0221 |
60
+ | 1.8965 | 0.1889 | 400 | 1.9968 |
61
+ | 1.9534 | 0.2361 | 500 | 1.9823 |
62
+ | 1.8621 | 0.2833 | 600 | 1.9679 |
63
+ | 1.9777 | 0.3305 | 700 | 1.9611 |
64
+ | 2.0865 | 0.3777 | 800 | 1.9544 |
65
+ | 1.9662 | 0.4249 | 900 | 1.9461 |
66
+ | 1.8352 | 0.4721 | 1000 | 1.9376 |
67
+ | 1.8973 | 0.5194 | 1100 | 1.9329 |
68
+ | 1.9688 | 0.5666 | 1200 | 1.9264 |
69
+ | 1.8383 | 0.6138 | 1300 | 1.9192 |
70
+ | 1.9032 | 0.6610 | 1400 | 1.9146 |
71
+ | 1.9295 | 0.7082 | 1500 | 1.9109 |
72
+ | 1.8207 | 0.7554 | 1600 | 1.9061 |
73
+ | 1.9119 | 0.8026 | 1700 | 1.9032 |
74
+ | 1.8392 | 0.8499 | 1800 | 1.9019 |
75
+ | 1.961 | 0.8971 | 1900 | 1.8994 |
76
+ | 1.8913 | 0.9443 | 2000 | 1.8945 |
77
+ | 1.8187 | 0.9915 | 2100 | 1.8941 |
78
+ | 1.7296 | 1.0387 | 2200 | 1.9006 |
79
+ | 1.6184 | 1.0859 | 2300 | 1.9040 |
80
+ | 1.6973 | 1.1331 | 2400 | 1.9056 |
81
+
82
+
83
+ ### Framework versions
84
+
85
+ - PEFT 0.10.0
86
+ - Transformers 4.40.0
87
+ - Pytorch 2.2.1
88
+ - Datasets 2.18.0
89
+ - Tokenizers 0.19.1
llama2_13b_peft/topical_chat/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/data1/model/llama2/meta-llama/Llama2-13b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "v_proj",
24
+ "down_proj",
25
+ "q_proj",
26
+ "gate_proj",
27
+ "up_proj",
28
+ "k_proj",
29
+ "o_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
llama2_13b_peft/topical_chat/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e6e2ffd0565e16b426d000f5f8d1e22505be002104c4426830c2fe30127625f
3
+ size 125248064
llama2_13b_peft/topical_chat/all_results.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.13314447592068,
3
+ "eval_loss": 1.8941270112991333,
4
+ "eval_runtime": 40.8751,
5
+ "eval_samples_per_second": 21.823,
6
+ "eval_steps_per_second": 2.74,
7
+ "total_flos": 9.512959383227597e+17,
8
+ "train_loss": 1.9100826263427735,
9
+ "train_runtime": 3885.2685,
10
+ "train_samples_per_second": 21.805,
11
+ "train_steps_per_second": 2.726
12
+ }
llama2_13b_peft/topical_chat/eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.13314447592068,
3
+ "eval_loss": 1.8941270112991333,
4
+ "eval_runtime": 40.8751,
5
+ "eval_samples_per_second": 21.823,
6
+ "eval_steps_per_second": 2.74
7
+ }
llama2_13b_peft/topical_chat/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
llama2_13b_peft/topical_chat/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
llama2_13b_peft/topical_chat/tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "LlamaTokenizer",
43
+ "unk_token": "<unk>",
44
+ "use_default_system_prompt": false
45
+ }