hushell committed on
Commit
4325427
1 Parent(s): ba60cc9

Model save

Browse files
README.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: ondevicellm/tinyllama_mole_sft_ultrachat_ep3
3
+ tags:
4
+ - trl
5
+ - dpo
6
+ - generated_from_trainer
7
+ model-index:
8
+ - name: tinyllama_mole_dpo_ep3
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ # tinyllama_mole_dpo_ep3
16
+
17
+ This model is a fine-tuned version of [ondevicellm/tinyllama_mole_sft_ultrachat_ep3](https://huggingface.co/ondevicellm/tinyllama_mole_sft_ultrachat_ep3) on the None dataset.
18
+ It achieves the following results on the evaluation set:
19
+ - Loss: 0.6285
20
+ - Rewards/chosen: -0.3050
21
+ - Rewards/rejected: -0.5353
22
+ - Rewards/accuracies: 0.6806
23
+ - Rewards/margins: 0.2302
24
+ - Logps/rejected: -354.2071
25
+ - Logps/chosen: -373.1399
26
+ - Logits/rejected: -1.6731
27
+ - Logits/chosen: -1.8041
28
+
29
+ ## Model description
30
+
31
+ More information needed
32
+
33
+ ## Intended uses & limitations
34
+
35
+ More information needed
36
+
37
+ ## Training and evaluation data
38
+
39
+ More information needed
40
+
41
+ ## Training procedure
42
+
43
+ ### Training hyperparameters
44
+
45
+ The following hyperparameters were used during training:
46
+ - learning_rate: 5e-07
47
+ - train_batch_size: 8
48
+ - eval_batch_size: 8
49
+ - seed: 42
50
+ - distributed_type: multi-GPU
51
+ - num_devices: 4
52
+ - gradient_accumulation_steps: 2
53
+ - total_train_batch_size: 64
54
+ - total_eval_batch_size: 32
55
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
56
+ - lr_scheduler_type: cosine
57
+ - lr_scheduler_warmup_steps: 100
58
+ - num_epochs: 1
59
+
60
+ ### Training results
61
+
62
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
63
+ |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
64
+ | 0.6896 | 0.1 | 100 | 0.6899 | 0.0064 | -0.0013 | 0.6448 | 0.0076 | -300.8089 | -342.0017 | -1.7574 | -1.8918 |
65
+ | 0.6762 | 0.21 | 200 | 0.6756 | -0.0293 | -0.0716 | 0.6627 | 0.0423 | -307.8423 | -345.5688 | -1.7501 | -1.8839 |
66
+ | 0.6499 | 0.31 | 300 | 0.6587 | -0.0875 | -0.1813 | 0.6687 | 0.0938 | -318.8118 | -351.3895 | -1.7358 | -1.8688 |
67
+ | 0.6374 | 0.42 | 400 | 0.6451 | -0.1726 | -0.3218 | 0.6746 | 0.1493 | -332.8632 | -359.8953 | -1.7164 | -1.8482 |
68
+ | 0.6348 | 0.52 | 500 | 0.6377 | -0.2696 | -0.4550 | 0.6647 | 0.1854 | -346.1808 | -369.6013 | -1.6884 | -1.8208 |
69
+ | 0.6308 | 0.63 | 600 | 0.6333 | -0.2783 | -0.4815 | 0.6726 | 0.2032 | -348.8291 | -370.4673 | -1.6965 | -1.8269 |
70
+ | 0.62 | 0.73 | 700 | 0.6312 | -0.2323 | -0.4505 | 0.6806 | 0.2182 | -345.7306 | -365.8656 | -1.6841 | -1.8149 |
71
+ | 0.6055 | 0.84 | 800 | 0.6287 | -0.2877 | -0.5169 | 0.6865 | 0.2292 | -352.3697 | -371.4099 | -1.6793 | -1.8099 |
72
+ | 0.6357 | 0.94 | 900 | 0.6285 | -0.3050 | -0.5353 | 0.6806 | 0.2302 | -354.2071 | -373.1399 | -1.6731 | -1.8041 |
73
+
74
+
75
+ ### Framework versions
76
+
77
+ - Transformers 4.37.0
78
+ - Pytorch 2.1.2+cu118
79
+ - Datasets 2.16.1
80
+ - Tokenizers 0.15.0
all_results.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "eval_logits/chosen": -1.8041099309921265,
4
+ "eval_logits/rejected": -1.6730901002883911,
5
+ "eval_logps/chosen": -373.1398620605469,
6
+ "eval_logps/rejected": -354.2070617675781,
7
+ "eval_loss": 0.6285176873207092,
8
+ "eval_rewards/accuracies": 0.6805555820465088,
9
+ "eval_rewards/chosen": -0.30501797795295715,
10
+ "eval_rewards/margins": 0.23024281859397888,
11
+ "eval_rewards/rejected": -0.535260796546936,
12
+ "eval_runtime": 337.0045,
13
+ "eval_samples": 2000,
14
+ "eval_samples_per_second": 5.935,
15
+ "eval_steps_per_second": 0.187,
16
+ "train_loss": 0.6456358959537526,
17
+ "train_runtime": 17140.5232,
18
+ "train_samples": 61135,
19
+ "train_samples_per_second": 3.567,
20
+ "train_steps_per_second": 0.056
21
+ }
eval_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "eval_logits/chosen": -1.8041099309921265,
4
+ "eval_logits/rejected": -1.6730901002883911,
5
+ "eval_logps/chosen": -373.1398620605469,
6
+ "eval_logps/rejected": -354.2070617675781,
7
+ "eval_loss": 0.6285176873207092,
8
+ "eval_rewards/accuracies": 0.6805555820465088,
9
+ "eval_rewards/chosen": -0.30501797795295715,
10
+ "eval_rewards/margins": 0.23024281859397888,
11
+ "eval_rewards/rejected": -0.535260796546936,
12
+ "eval_runtime": 337.0045,
13
+ "eval_samples": 2000,
14
+ "eval_samples_per_second": 5.935,
15
+ "eval_steps_per_second": 0.187
16
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.37.0",
6
+ "use_cache": false
7
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:22598c3d55a92213b26aab9ebd3d36f0708966ccfd003498063bf4604f914a7d
3
  size 2223960880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31732c1d87bb760b754f5718db58cb66cb3423c26baf7a1f0855ab58a2d9d493
3
  size 2223960880
runs/Jan26_08-44-51_main1/events.out.tfevents.1706259102.main1.71884.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e9cfe5693673fb3ea8b8a5c66cde4688aff388ed7caf43b1447596f471cc057
3
- size 62350
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:477da43a49b5ad27fe6bf8a92038c819502ebab2ee0ae51907ecae0421544580
3
+ size 72954
runs/Jan26_08-44-51_main1/events.out.tfevents.1706276579.main1.71884.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff51a67ac05583c8159b204d729a64cd493901238fa240da1c0a36688ca5e16b
3
+ size 828
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.6456358959537526,
4
+ "train_runtime": 17140.5232,
5
+ "train_samples": 61135,
6
+ "train_samples_per_second": 3.567,
7
+ "train_steps_per_second": 0.056
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1518 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6285176873207092,
3
+ "best_model_checkpoint": "data/tinyllama_mole_dpo_ep3/checkpoint-900",
4
+ "epoch": 0.9994767137624281,
5
+ "eval_steps": 100,
6
+ "global_step": 955,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 5e-09,
14
+ "logits/chosen": -1.790444016456604,
15
+ "logits/rejected": -1.7375602722167969,
16
+ "logps/chosen": -288.11163330078125,
17
+ "logps/rejected": -270.18121337890625,
18
+ "loss": 0.6931,
19
+ "rewards/accuracies": 0.0,
20
+ "rewards/chosen": 0.0,
21
+ "rewards/margins": 0.0,
22
+ "rewards/rejected": 0.0,
23
+ "step": 1
24
+ },
25
+ {
26
+ "epoch": 0.01,
27
+ "learning_rate": 5e-08,
28
+ "logits/chosen": -1.9029791355133057,
29
+ "logits/rejected": -1.6527897119522095,
30
+ "logps/chosen": -349.9918518066406,
31
+ "logps/rejected": -281.4112243652344,
32
+ "loss": 0.693,
33
+ "rewards/accuracies": 0.5069444179534912,
34
+ "rewards/chosen": 0.0003158848558086902,
35
+ "rewards/margins": 0.0004107448039576411,
36
+ "rewards/rejected": -9.485996997682378e-05,
37
+ "step": 10
38
+ },
39
+ {
40
+ "epoch": 0.02,
41
+ "learning_rate": 1e-07,
42
+ "logits/chosen": -1.773256540298462,
43
+ "logits/rejected": -1.7059326171875,
44
+ "logps/chosen": -306.29168701171875,
45
+ "logps/rejected": -274.650634765625,
46
+ "loss": 0.693,
47
+ "rewards/accuracies": 0.5,
48
+ "rewards/chosen": 0.0002991842629853636,
49
+ "rewards/margins": 0.00016614615742582828,
50
+ "rewards/rejected": 0.000133038149215281,
51
+ "step": 20
52
+ },
53
+ {
54
+ "epoch": 0.03,
55
+ "learning_rate": 1.5e-07,
56
+ "logits/chosen": -1.8527847528457642,
57
+ "logits/rejected": -1.707421898841858,
58
+ "logps/chosen": -339.32342529296875,
59
+ "logps/rejected": -297.5170593261719,
60
+ "loss": 0.693,
61
+ "rewards/accuracies": 0.512499988079071,
62
+ "rewards/chosen": -0.0001883753720903769,
63
+ "rewards/margins": -0.00028402512543834746,
64
+ "rewards/rejected": 9.56498843152076e-05,
65
+ "step": 30
66
+ },
67
+ {
68
+ "epoch": 0.04,
69
+ "learning_rate": 2e-07,
70
+ "logits/chosen": -1.869114875793457,
71
+ "logits/rejected": -1.7149085998535156,
72
+ "logps/chosen": -329.600830078125,
73
+ "logps/rejected": -274.0981140136719,
74
+ "loss": 0.693,
75
+ "rewards/accuracies": 0.4937500059604645,
76
+ "rewards/chosen": 0.0005581147270277143,
77
+ "rewards/margins": 0.000760135124437511,
78
+ "rewards/rejected": -0.00020202035375405103,
79
+ "step": 40
80
+ },
81
+ {
82
+ "epoch": 0.05,
83
+ "learning_rate": 2.5e-07,
84
+ "logits/chosen": -1.8775875568389893,
85
+ "logits/rejected": -1.7631309032440186,
86
+ "logps/chosen": -344.46002197265625,
87
+ "logps/rejected": -307.79693603515625,
88
+ "loss": 0.6928,
89
+ "rewards/accuracies": 0.5,
90
+ "rewards/chosen": 0.0002629683876875788,
91
+ "rewards/margins": 0.0003627274709288031,
92
+ "rewards/rejected": -9.97590395854786e-05,
93
+ "step": 50
94
+ },
95
+ {
96
+ "epoch": 0.06,
97
+ "learning_rate": 3e-07,
98
+ "logits/chosen": -1.8707891702651978,
99
+ "logits/rejected": -1.711287498474121,
100
+ "logps/chosen": -345.78326416015625,
101
+ "logps/rejected": -318.0264892578125,
102
+ "loss": 0.6926,
103
+ "rewards/accuracies": 0.574999988079071,
104
+ "rewards/chosen": 0.0013746457407251,
105
+ "rewards/margins": 0.0012839403934776783,
106
+ "rewards/rejected": 9.070520900422707e-05,
107
+ "step": 60
108
+ },
109
+ {
110
+ "epoch": 0.07,
111
+ "learning_rate": 3.5e-07,
112
+ "logits/chosen": -1.997075080871582,
113
+ "logits/rejected": -1.8358447551727295,
114
+ "logps/chosen": -379.5538024902344,
115
+ "logps/rejected": -332.5546875,
116
+ "loss": 0.6926,
117
+ "rewards/accuracies": 0.612500011920929,
118
+ "rewards/chosen": 0.001883229473605752,
119
+ "rewards/margins": 0.0018280971562489867,
120
+ "rewards/rejected": 5.51322809769772e-05,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.08,
125
+ "learning_rate": 4e-07,
126
+ "logits/chosen": -1.7132848501205444,
127
+ "logits/rejected": -1.5544992685317993,
128
+ "logps/chosen": -352.10699462890625,
129
+ "logps/rejected": -292.5279846191406,
130
+ "loss": 0.6914,
131
+ "rewards/accuracies": 0.6812499761581421,
132
+ "rewards/chosen": 0.0026874816976487637,
133
+ "rewards/margins": 0.0036670640110969543,
134
+ "rewards/rejected": -0.000979582779109478,
135
+ "step": 80
136
+ },
137
+ {
138
+ "epoch": 0.09,
139
+ "learning_rate": 4.5e-07,
140
+ "logits/chosen": -1.9277913570404053,
141
+ "logits/rejected": -1.8120664358139038,
142
+ "logps/chosen": -345.2822570800781,
143
+ "logps/rejected": -309.4122314453125,
144
+ "loss": 0.6909,
145
+ "rewards/accuracies": 0.5562499761581421,
146
+ "rewards/chosen": 0.00368087668903172,
147
+ "rewards/margins": 0.004829054698348045,
148
+ "rewards/rejected": -0.0011481784749776125,
149
+ "step": 90
150
+ },
151
+ {
152
+ "epoch": 0.1,
153
+ "learning_rate": 5e-07,
154
+ "logits/chosen": -1.9077354669570923,
155
+ "logits/rejected": -1.8217405080795288,
156
+ "logps/chosen": -364.0276184082031,
157
+ "logps/rejected": -340.09906005859375,
158
+ "loss": 0.6896,
159
+ "rewards/accuracies": 0.699999988079071,
160
+ "rewards/chosen": 0.00508699519559741,
161
+ "rewards/margins": 0.007315085269510746,
162
+ "rewards/rejected": -0.002228089142590761,
163
+ "step": 100
164
+ },
165
+ {
166
+ "epoch": 0.1,
167
+ "eval_logits/chosen": -1.8918097019195557,
168
+ "eval_logits/rejected": -1.7573894262313843,
169
+ "eval_logps/chosen": -342.0017395019531,
170
+ "eval_logps/rejected": -300.80889892578125,
171
+ "eval_loss": 0.689919114112854,
172
+ "eval_rewards/accuracies": 0.6448412537574768,
173
+ "eval_rewards/chosen": 0.006363342050462961,
174
+ "eval_rewards/margins": 0.007642398122698069,
175
+ "eval_rewards/rejected": -0.0012790567707270384,
176
+ "eval_runtime": 339.8787,
177
+ "eval_samples_per_second": 5.884,
178
+ "eval_steps_per_second": 0.185,
179
+ "step": 100
180
+ },
181
+ {
182
+ "epoch": 0.12,
183
+ "learning_rate": 4.998312558730158e-07,
184
+ "logits/chosen": -1.8893086910247803,
185
+ "logits/rejected": -1.6524394750595093,
186
+ "logps/chosen": -312.1143493652344,
187
+ "logps/rejected": -249.64523315429688,
188
+ "loss": 0.6887,
189
+ "rewards/accuracies": 0.643750011920929,
190
+ "rewards/chosen": 0.005943284835666418,
191
+ "rewards/margins": 0.008504782803356647,
192
+ "rewards/rejected": -0.002561498200520873,
193
+ "step": 110
194
+ },
195
+ {
196
+ "epoch": 0.13,
197
+ "learning_rate": 4.993252512887069e-07,
198
+ "logits/chosen": -1.8319896459579468,
199
+ "logits/rejected": -1.6496975421905518,
200
+ "logps/chosen": -358.3703308105469,
201
+ "logps/rejected": -289.1592102050781,
202
+ "loss": 0.6873,
203
+ "rewards/accuracies": 0.6625000238418579,
204
+ "rewards/chosen": 0.009710123762488365,
205
+ "rewards/margins": 0.017875777557492256,
206
+ "rewards/rejected": -0.008165654726326466,
207
+ "step": 120
208
+ },
209
+ {
210
+ "epoch": 0.14,
211
+ "learning_rate": 4.984826693294873e-07,
212
+ "logits/chosen": -1.9560505151748657,
213
+ "logits/rejected": -1.7518360614776611,
214
+ "logps/chosen": -377.28082275390625,
215
+ "logps/rejected": -308.9075927734375,
216
+ "loss": 0.6866,
217
+ "rewards/accuracies": 0.612500011920929,
218
+ "rewards/chosen": 0.009400544688105583,
219
+ "rewards/margins": 0.014766154810786247,
220
+ "rewards/rejected": -0.005365610122680664,
221
+ "step": 130
222
+ },
223
+ {
224
+ "epoch": 0.15,
225
+ "learning_rate": 4.973046474414144e-07,
226
+ "logits/chosen": -1.7957839965820312,
227
+ "logits/rejected": -1.6959806680679321,
228
+ "logps/chosen": -283.5320129394531,
229
+ "logps/rejected": -259.69097900390625,
230
+ "loss": 0.6851,
231
+ "rewards/accuracies": 0.6312500238418579,
232
+ "rewards/chosen": 0.00773229356855154,
233
+ "rewards/margins": 0.01577392965555191,
234
+ "rewards/rejected": -0.00804163608700037,
235
+ "step": 140
236
+ },
237
+ {
238
+ "epoch": 0.16,
239
+ "learning_rate": 4.957927758986888e-07,
240
+ "logits/chosen": -1.8573449850082397,
241
+ "logits/rejected": -1.7665218114852905,
242
+ "logps/chosen": -333.4697265625,
243
+ "logps/rejected": -334.0047912597656,
244
+ "loss": 0.6864,
245
+ "rewards/accuracies": 0.625,
246
+ "rewards/chosen": -0.00011548856127774343,
247
+ "rewards/margins": 0.005317127797752619,
248
+ "rewards/rejected": -0.005432615987956524,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.17,
253
+ "learning_rate": 4.939490956568589e-07,
254
+ "logits/chosen": -1.9507248401641846,
255
+ "logits/rejected": -1.7802870273590088,
256
+ "logps/chosen": -352.24407958984375,
257
+ "logps/rejected": -330.3514099121094,
258
+ "loss": 0.6815,
259
+ "rewards/accuracies": 0.668749988079071,
260
+ "rewards/chosen": 0.0028652914334088564,
261
+ "rewards/margins": 0.018842989578843117,
262
+ "rewards/rejected": -0.01597769930958748,
263
+ "step": 160
264
+ },
265
+ {
266
+ "epoch": 0.18,
267
+ "learning_rate": 4.917760955976277e-07,
268
+ "logits/chosen": -1.9207957983016968,
269
+ "logits/rejected": -1.7402126789093018,
270
+ "logps/chosen": -322.8128662109375,
271
+ "logps/rejected": -280.6071472167969,
272
+ "loss": 0.6807,
273
+ "rewards/accuracies": 0.65625,
274
+ "rewards/chosen": 0.006607011891901493,
275
+ "rewards/margins": 0.029789209365844727,
276
+ "rewards/rejected": -0.023182198405265808,
277
+ "step": 170
278
+ },
279
+ {
280
+ "epoch": 0.19,
281
+ "learning_rate": 4.892767091689785e-07,
282
+ "logits/chosen": -1.8039575815200806,
283
+ "logits/rejected": -1.6338218450546265,
284
+ "logps/chosen": -298.98541259765625,
285
+ "logps/rejected": -268.31658935546875,
286
+ "loss": 0.6781,
287
+ "rewards/accuracies": 0.668749988079071,
288
+ "rewards/chosen": -0.010788476094603539,
289
+ "rewards/margins": 0.033482082188129425,
290
+ "rewards/rejected": -0.044270556420087814,
291
+ "step": 180
292
+ },
293
+ {
294
+ "epoch": 0.2,
295
+ "learning_rate": 4.864543104251586e-07,
296
+ "logits/chosen": -1.9226608276367188,
297
+ "logits/rejected": -1.7317349910736084,
298
+ "logps/chosen": -342.0778503417969,
299
+ "logps/rejected": -300.3584899902344,
300
+ "loss": 0.6758,
301
+ "rewards/accuracies": 0.675000011920929,
302
+ "rewards/chosen": -0.010154007002711296,
303
+ "rewards/margins": 0.03691417723894119,
304
+ "rewards/rejected": -0.04706818610429764,
305
+ "step": 190
306
+ },
307
+ {
308
+ "epoch": 0.21,
309
+ "learning_rate": 4.833127094718643e-07,
310
+ "logits/chosen": -1.7831227779388428,
311
+ "logits/rejected": -1.6590824127197266,
312
+ "logps/chosen": -343.6825866699219,
313
+ "logps/rejected": -319.09796142578125,
314
+ "loss": 0.6762,
315
+ "rewards/accuracies": 0.6812499761581421,
316
+ "rewards/chosen": -0.012453018687665462,
317
+ "rewards/margins": 0.04117157310247421,
318
+ "rewards/rejected": -0.05362458899617195,
319
+ "step": 200
320
+ },
321
+ {
322
+ "epoch": 0.21,
323
+ "eval_logits/chosen": -1.883855938911438,
324
+ "eval_logits/rejected": -1.7501325607299805,
325
+ "eval_logps/chosen": -345.56878662109375,
326
+ "eval_logps/rejected": -307.84234619140625,
327
+ "eval_loss": 0.6756463646888733,
328
+ "eval_rewards/accuracies": 0.6626983880996704,
329
+ "eval_rewards/chosen": -0.029307426884770393,
330
+ "eval_rewards/margins": 0.04230639338493347,
331
+ "eval_rewards/rejected": -0.07161381840705872,
332
+ "eval_runtime": 342.3556,
333
+ "eval_samples_per_second": 5.842,
334
+ "eval_steps_per_second": 0.184,
335
+ "step": 200
336
+ },
337
+ {
338
+ "epoch": 0.22,
339
+ "learning_rate": 4.79856147322777e-07,
340
+ "logits/chosen": -1.9185651540756226,
341
+ "logits/rejected": -1.6988731622695923,
342
+ "logps/chosen": -368.73504638671875,
343
+ "logps/rejected": -302.3857421875,
344
+ "loss": 0.6743,
345
+ "rewards/accuracies": 0.643750011920929,
346
+ "rewards/chosen": -0.036936286836862564,
347
+ "rewards/margins": 0.04566096514463425,
348
+ "rewards/rejected": -0.08259725570678711,
349
+ "step": 210
350
+ },
351
+ {
352
+ "epoch": 0.23,
353
+ "learning_rate": 4.760892901743944e-07,
354
+ "logits/chosen": -1.9230194091796875,
355
+ "logits/rejected": -1.7622359991073608,
356
+ "logps/chosen": -352.57354736328125,
357
+ "logps/rejected": -324.182373046875,
358
+ "loss": 0.6748,
359
+ "rewards/accuracies": 0.65625,
360
+ "rewards/chosen": -0.03244791179895401,
361
+ "rewards/margins": 0.04869867116212845,
362
+ "rewards/rejected": -0.08114659041166306,
363
+ "step": 220
364
+ },
365
+ {
366
+ "epoch": 0.24,
367
+ "learning_rate": 4.720172231068844e-07,
368
+ "logits/chosen": -1.8404433727264404,
369
+ "logits/rejected": -1.6695423126220703,
370
+ "logps/chosen": -354.01727294921875,
371
+ "logps/rejected": -295.2720642089844,
372
+ "loss": 0.665,
373
+ "rewards/accuracies": 0.668749988079071,
374
+ "rewards/chosen": -0.050288014113903046,
375
+ "rewards/margins": 0.053208403289318085,
376
+ "rewards/rejected": -0.10349641740322113,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 0.25,
381
+ "learning_rate": 4.6764544321946557e-07,
382
+ "logits/chosen": -1.851320505142212,
383
+ "logits/rejected": -1.6485137939453125,
384
+ "logps/chosen": -338.0115051269531,
385
+ "logps/rejected": -272.17596435546875,
386
+ "loss": 0.6675,
387
+ "rewards/accuracies": 0.6312500238418579,
388
+ "rewards/chosen": -0.04714337736368179,
389
+ "rewards/margins": 0.05400116369128227,
390
+ "rewards/rejected": -0.10114455223083496,
391
+ "step": 240
392
+ },
393
+ {
394
+ "epoch": 0.26,
395
+ "learning_rate": 4.6297985220958176e-07,
396
+ "logits/chosen": -1.858806848526001,
397
+ "logits/rejected": -1.7875276803970337,
398
+ "logps/chosen": -323.6578369140625,
399
+ "logps/rejected": -327.8287048339844,
400
+ "loss": 0.6704,
401
+ "rewards/accuracies": 0.668749988079071,
402
+ "rewards/chosen": -0.06438130885362625,
403
+ "rewards/margins": 0.054077375680208206,
404
+ "rewards/rejected": -0.11845867335796356,
405
+ "step": 250
406
+ },
407
+ {
408
+ "epoch": 0.27,
409
+ "learning_rate": 4.580267484058875e-07,
410
+ "logits/chosen": -1.787245512008667,
411
+ "logits/rejected": -1.6371400356292725,
412
+ "logps/chosen": -346.79132080078125,
413
+ "logps/rejected": -315.600341796875,
414
+ "loss": 0.6655,
415
+ "rewards/accuracies": 0.668749988079071,
416
+ "rewards/chosen": -0.07440805435180664,
417
+ "rewards/margins": 0.07648531347513199,
418
+ "rewards/rejected": -0.15089336037635803,
419
+ "step": 260
420
+ },
421
+ {
422
+ "epoch": 0.28,
423
+ "learning_rate": 4.527928182658005e-07,
424
+ "logits/chosen": -1.9231889247894287,
425
+ "logits/rejected": -1.7725601196289062,
426
+ "logps/chosen": -357.95367431640625,
427
+ "logps/rejected": -305.06793212890625,
428
+ "loss": 0.6624,
429
+ "rewards/accuracies": 0.7124999761581421,
430
+ "rewards/chosen": -0.0633041113615036,
431
+ "rewards/margins": 0.09316254407167435,
432
+ "rewards/rejected": -0.15646666288375854,
433
+ "step": 270
434
+ },
435
+ {
436
+ "epoch": 0.29,
437
+ "learning_rate": 4.472851273490984e-07,
438
+ "logits/chosen": -1.9144046306610107,
439
+ "logits/rejected": -1.8413257598876953,
440
+ "logps/chosen": -356.4827575683594,
441
+ "logps/rejected": -339.57952880859375,
442
+ "loss": 0.6647,
443
+ "rewards/accuracies": 0.706250011920929,
444
+ "rewards/chosen": -0.07623454183340073,
445
+ "rewards/margins": 0.06355991959571838,
446
+ "rewards/rejected": -0.1397944837808609,
447
+ "step": 280
448
+ },
449
+ {
450
+ "epoch": 0.3,
451
+ "learning_rate": 4.415111107797445e-07,
452
+ "logits/chosen": -1.7934792041778564,
453
+ "logits/rejected": -1.7078644037246704,
454
+ "logps/chosen": -343.297607421875,
455
+ "logps/rejected": -307.45904541015625,
456
+ "loss": 0.6574,
457
+ "rewards/accuracies": 0.6000000238418579,
458
+ "rewards/chosen": -0.09327341616153717,
459
+ "rewards/margins": 0.06554602831602097,
460
+ "rewards/rejected": -0.15881945192813873,
461
+ "step": 290
462
+ },
463
+ {
464
+ "epoch": 0.31,
465
+ "learning_rate": 4.3547856320882036e-07,
466
+ "logits/chosen": -1.8048231601715088,
467
+ "logits/rejected": -1.560224175453186,
468
+ "logps/chosen": -344.6669921875,
469
+ "logps/rejected": -290.4841613769531,
470
+ "loss": 0.6499,
471
+ "rewards/accuracies": 0.65625,
472
+ "rewards/chosen": -0.07834906131029129,
473
+ "rewards/margins": 0.09436166286468506,
474
+ "rewards/rejected": -0.17271070182323456,
475
+ "step": 300
476
+ },
477
+ {
478
+ "epoch": 0.31,
479
+ "eval_logits/chosen": -1.868820071220398,
480
+ "eval_logits/rejected": -1.7357724905014038,
481
+ "eval_logps/chosen": -351.38946533203125,
482
+ "eval_logps/rejected": -318.8117980957031,
483
+ "eval_loss": 0.6586803793907166,
484
+ "eval_rewards/accuracies": 0.6686508059501648,
485
+ "eval_rewards/chosen": -0.08751402050256729,
486
+ "eval_rewards/margins": 0.09379409998655319,
487
+ "eval_rewards/rejected": -0.18130813539028168,
488
+ "eval_runtime": 336.7839,
489
+ "eval_samples_per_second": 5.939,
490
+ "eval_steps_per_second": 0.187,
491
+ "step": 300
492
+ },
493
+ {
494
+ "epoch": 0.32,
495
+ "learning_rate": 4.291956282921128e-07,
496
+ "logits/chosen": -1.7621396780014038,
497
+ "logits/rejected": -1.651000738143921,
498
+ "logps/chosen": -348.511474609375,
499
+ "logps/rejected": -330.1929931640625,
500
+ "loss": 0.6514,
501
+ "rewards/accuracies": 0.6937500238418579,
502
+ "rewards/chosen": -0.10528180748224258,
503
+ "rewards/margins": 0.11313805729150772,
504
+ "rewards/rejected": -0.2184198647737503,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 0.33,
509
+ "learning_rate": 4.2267078769656105e-07,
510
+ "logits/chosen": -1.8093763589859009,
511
+ "logits/rejected": -1.733646035194397,
512
+ "logps/chosen": -339.96807861328125,
513
+ "logps/rejected": -329.6910705566406,
514
+ "loss": 0.6485,
515
+ "rewards/accuracies": 0.606249988079071,
516
+ "rewards/chosen": -0.10823850333690643,
517
+ "rewards/margins": 0.10142701864242554,
518
+ "rewards/rejected": -0.20966553688049316,
519
+ "step": 320
520
+ },
521
+ {
522
+ "epoch": 0.35,
523
+ "learning_rate": 4.159128496504053e-07,
524
+ "logits/chosen": -1.8124803304672241,
525
+ "logits/rejected": -1.656519889831543,
526
+ "logps/chosen": -338.48504638671875,
527
+ "logps/rejected": -287.4867248535156,
528
+ "loss": 0.6577,
529
+ "rewards/accuracies": 0.643750011920929,
530
+ "rewards/chosen": -0.09331468492746353,
531
+ "rewards/margins": 0.10502304136753082,
532
+ "rewards/rejected": -0.19833770394325256,
533
+ "step": 330
534
+ },
535
+ {
536
+ "epoch": 0.36,
537
+ "learning_rate": 4.0893093705249207e-07,
538
+ "logits/chosen": -1.7872120141983032,
539
+ "logits/rejected": -1.6502597332000732,
540
+ "logps/chosen": -302.8948669433594,
541
+ "logps/rejected": -278.6997985839844,
542
+ "loss": 0.6597,
543
+ "rewards/accuracies": 0.5562499761581421,
544
+ "rewards/chosen": -0.1376679241657257,
545
+ "rewards/margins": 0.06090687960386276,
546
+ "rewards/rejected": -0.19857481122016907,
547
+ "step": 340
548
+ },
549
+ {
550
+ "epoch": 0.37,
551
+ "learning_rate": 4.0173447515678915e-07,
552
+ "logits/chosen": -1.745719313621521,
553
+ "logits/rejected": -1.7379214763641357,
554
+ "logps/chosen": -341.92218017578125,
555
+ "logps/rejected": -346.43121337890625,
556
+ "loss": 0.6531,
557
+ "rewards/accuracies": 0.6499999761581421,
558
+ "rewards/chosen": -0.1528899073600769,
559
+ "rewards/margins": 0.10737234354019165,
560
+ "rewards/rejected": -0.26026225090026855,
561
+ "step": 350
562
+ },
563
+ {
564
+ "epoch": 0.38,
565
+ "learning_rate": 3.9433317884873665e-07,
566
+ "logits/chosen": -1.896888017654419,
567
+ "logits/rejected": -1.7774041891098022,
568
+ "logps/chosen": -345.6350402832031,
569
+ "logps/rejected": -304.7301025390625,
570
+ "loss": 0.6427,
571
+ "rewards/accuracies": 0.6625000238418579,
572
+ "rewards/chosen": -0.18082933127880096,
573
+ "rewards/margins": 0.10347436368465424,
574
+ "rewards/rejected": -0.2843037247657776,
575
+ "step": 360
576
+ },
577
+ {
578
+ "epoch": 0.39,
579
+ "learning_rate": 3.867370395306068e-07,
580
+ "logits/chosen": -1.7397247552871704,
581
+ "logits/rejected": -1.6811482906341553,
582
+ "logps/chosen": -297.6015625,
583
+ "logps/rejected": -314.14093017578125,
584
+ "loss": 0.6415,
585
+ "rewards/accuracies": 0.675000011920929,
586
+ "rewards/chosen": -0.17070330679416656,
587
+ "rewards/margins": 0.12213484942913055,
588
+ "rewards/rejected": -0.2928381562232971,
589
+ "step": 370
590
+ },
591
+ {
592
+ "epoch": 0.4,
593
+ "learning_rate": 3.78956311633581e-07,
594
+ "logits/chosen": -1.9287761449813843,
595
+ "logits/rejected": -1.7968651056289673,
596
+ "logps/chosen": -369.02142333984375,
597
+ "logps/rejected": -335.91986083984375,
598
+ "loss": 0.6426,
599
+ "rewards/accuracies": 0.668749988079071,
600
+ "rewards/chosen": -0.1528002917766571,
601
+ "rewards/margins": 0.15230461955070496,
602
+ "rewards/rejected": -0.30510494112968445,
603
+ "step": 380
604
+ },
605
+ {
606
+ "epoch": 0.41,
607
+ "learning_rate": 3.7100149877474976e-07,
608
+ "logits/chosen": -1.7732282876968384,
609
+ "logits/rejected": -1.6088358163833618,
610
+ "logps/chosen": -340.55804443359375,
611
+ "logps/rejected": -296.6915283203125,
612
+ "loss": 0.6505,
613
+ "rewards/accuracies": 0.6937500238418579,
614
+ "rewards/chosen": -0.1547761708498001,
615
+ "rewards/margins": 0.11781679093837738,
616
+ "rewards/rejected": -0.2725929617881775,
617
+ "step": 390
618
+ },
619
+ {
620
+ "epoch": 0.42,
621
+ "learning_rate": 3.6288333957772234e-07,
622
+ "logits/chosen": -1.9192142486572266,
623
+ "logits/rejected": -1.7736551761627197,
624
+ "logps/chosen": -382.8447265625,
625
+ "logps/rejected": -338.8296813964844,
626
+ "loss": 0.6374,
627
+ "rewards/accuracies": 0.668749988079071,
628
+ "rewards/chosen": -0.18712495267391205,
629
+ "rewards/margins": 0.15186870098114014,
630
+ "rewards/rejected": -0.338993638753891,
631
+ "step": 400
632
+ },
633
+ {
634
+ "epoch": 0.42,
635
+ "eval_logits/chosen": -1.848189353942871,
636
+ "eval_logits/rejected": -1.7164283990859985,
637
+ "eval_logps/chosen": -359.8952941894531,
638
+ "eval_logps/rejected": -332.86322021484375,
639
+ "eval_loss": 0.6451287865638733,
640
+ "eval_rewards/accuracies": 0.6746031641960144,
641
+ "eval_rewards/chosen": -0.1725723147392273,
642
+ "eval_rewards/margins": 0.14925029873847961,
643
+ "eval_rewards/rejected": -0.3218226134777069,
644
+ "eval_runtime": 336.4318,
645
+ "eval_samples_per_second": 5.945,
646
+ "eval_steps_per_second": 0.187,
647
+ "step": 400
648
+ },
649
+ {
650
+ "epoch": 0.43,
651
+ "learning_rate": 3.5461279317599025e-07,
652
+ "logits/chosen": -1.9535846710205078,
653
+ "logits/rejected": -1.8399330377578735,
654
+ "logps/chosen": -380.81396484375,
655
+ "logps/rejected": -362.853759765625,
656
+ "loss": 0.6402,
657
+ "rewards/accuracies": 0.6625000238418579,
658
+ "rewards/chosen": -0.1376427710056305,
659
+ "rewards/margins": 0.15401354432106018,
660
+ "rewards/rejected": -0.2916563153266907,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 0.44,
665
+ "learning_rate": 3.4620102441861144e-07,
666
+ "logits/chosen": -1.7898231744766235,
667
+ "logits/rejected": -1.6419872045516968,
668
+ "logps/chosen": -319.6932678222656,
669
+ "logps/rejected": -280.15960693359375,
670
+ "loss": 0.6387,
671
+ "rewards/accuracies": 0.643750011920929,
672
+ "rewards/chosen": -0.14749577641487122,
673
+ "rewards/margins": 0.12767234444618225,
674
+ "rewards/rejected": -0.2751680910587311,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 0.45,
679
+ "learning_rate": 3.376593887981886e-07,
680
+ "logits/chosen": -1.7821381092071533,
681
+ "logits/rejected": -1.6903321743011475,
682
+ "logps/chosen": -329.77142333984375,
683
+ "logps/rejected": -305.03271484375,
684
+ "loss": 0.6447,
685
+ "rewards/accuracies": 0.59375,
686
+ "rewards/chosen": -0.20932936668395996,
687
+ "rewards/margins": 0.10754366964101791,
688
+ "rewards/rejected": -0.3168730139732361,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 0.46,
693
+ "learning_rate": 3.2899941712148813e-07,
694
+ "logits/chosen": -1.7692524194717407,
695
+ "logits/rejected": -1.703125,
696
+ "logps/chosen": -330.9040222167969,
697
+ "logps/rejected": -317.64447021484375,
698
+ "loss": 0.6428,
699
+ "rewards/accuracies": 0.5874999761581421,
700
+ "rewards/chosen": -0.2360539734363556,
701
+ "rewards/margins": 0.1059914231300354,
702
+ "rewards/rejected": -0.342045396566391,
703
+ "step": 440
704
+ },
705
+ {
706
+ "epoch": 0.47,
707
+ "learning_rate": 3.2023279994339236e-07,
708
+ "logits/chosen": -1.6386339664459229,
709
+ "logits/rejected": -1.5296618938446045,
710
+ "logps/chosen": -331.60919189453125,
711
+ "logps/rejected": -312.84686279296875,
712
+ "loss": 0.6366,
713
+ "rewards/accuracies": 0.6812499761581421,
714
+ "rewards/chosen": -0.18304389715194702,
715
+ "rewards/margins": 0.18606719374656677,
716
+ "rewards/rejected": -0.3691111207008362,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 0.48,
721
+ "learning_rate": 3.1137137178519977e-07,
722
+ "logits/chosen": -1.8083422183990479,
723
+ "logits/rejected": -1.666670799255371,
724
+ "logps/chosen": -337.09991455078125,
725
+ "logps/rejected": -324.5543518066406,
726
+ "loss": 0.6313,
727
+ "rewards/accuracies": 0.581250011920929,
728
+ "rewards/chosen": -0.19549410045146942,
729
+ "rewards/margins": 0.13338689506053925,
730
+ "rewards/rejected": -0.32888099551200867,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 0.49,
735
+ "learning_rate": 3.024270951585776e-07,
736
+ "logits/chosen": -1.8487087488174438,
737
+ "logits/rejected": -1.6575820446014404,
738
+ "logps/chosen": -383.4247131347656,
739
+ "logps/rejected": -340.35845947265625,
740
+ "loss": 0.6264,
741
+ "rewards/accuracies": 0.7437499761581421,
742
+ "rewards/chosen": -0.2236436903476715,
743
+ "rewards/margins": 0.2502152621746063,
744
+ "rewards/rejected": -0.4738590121269226,
745
+ "step": 470
746
+ },
747
+ {
748
+ "epoch": 0.5,
749
+ "learning_rate": 2.934120444167326e-07,
750
+ "logits/chosen": -1.6958894729614258,
751
+ "logits/rejected": -1.634203553199768,
752
+ "logps/chosen": -303.4564208984375,
753
+ "logps/rejected": -303.09417724609375,
754
+ "loss": 0.6552,
755
+ "rewards/accuracies": 0.612500011920929,
756
+ "rewards/chosen": -0.2998581528663635,
757
+ "rewards/margins": 0.08801662921905518,
758
+ "rewards/rejected": -0.3878747820854187,
759
+ "step": 480
760
+ },
761
+ {
762
+ "epoch": 0.51,
763
+ "learning_rate": 2.8433838945460205e-07,
764
+ "logits/chosen": -1.7905118465423584,
765
+ "logits/rejected": -1.6190725564956665,
766
+ "logps/chosen": -353.28582763671875,
767
+ "logps/rejected": -327.52947998046875,
768
+ "loss": 0.6305,
769
+ "rewards/accuracies": 0.643750011920929,
770
+ "rewards/chosen": -0.24126093089580536,
771
+ "rewards/margins": 0.1726113259792328,
772
+ "rewards/rejected": -0.41387230157852173,
773
+ "step": 490
774
+ },
775
+ {
776
+ "epoch": 0.52,
777
+ "learning_rate": 2.752183792800671e-07,
778
+ "logits/chosen": -1.780226469039917,
779
+ "logits/rejected": -1.6524947881698608,
780
+ "logps/chosen": -338.72564697265625,
781
+ "logps/rejected": -325.1302185058594,
782
+ "loss": 0.6348,
783
+ "rewards/accuracies": 0.65625,
784
+ "rewards/chosen": -0.2332509458065033,
785
+ "rewards/margins": 0.17690421640872955,
786
+ "rewards/rejected": -0.41015520691871643,
787
+ "step": 500
788
+ },
789
+ {
790
+ "epoch": 0.52,
791
+ "eval_logits/chosen": -1.8207694292068481,
792
+ "eval_logits/rejected": -1.6884433031082153,
793
+ "eval_logps/chosen": -369.601318359375,
794
+ "eval_logps/rejected": -346.1808166503906,
795
+ "eval_loss": 0.6377259492874146,
796
+ "eval_rewards/accuracies": 0.6646825671195984,
797
+ "eval_rewards/chosen": -0.26963263750076294,
798
+ "eval_rewards/margins": 0.18536561727523804,
799
+ "eval_rewards/rejected": -0.454998254776001,
800
+ "eval_runtime": 336.8999,
801
+ "eval_samples_per_second": 5.936,
802
+ "eval_steps_per_second": 0.187,
803
+ "step": 500
804
+ },
805
+ {
806
+ "epoch": 0.53,
807
+ "learning_rate": 2.6606432547836753e-07,
808
+ "logits/chosen": -1.7960855960845947,
809
+ "logits/rejected": -1.5604727268218994,
810
+ "logps/chosen": -385.92523193359375,
811
+ "logps/rejected": -332.73236083984375,
812
+ "loss": 0.6292,
813
+ "rewards/accuracies": 0.643750011920929,
814
+ "rewards/chosen": -0.2777213156223297,
815
+ "rewards/margins": 0.21707573533058167,
816
+ "rewards/rejected": -0.4947970509529114,
817
+ "step": 510
818
+ },
819
+ {
820
+ "epoch": 0.54,
821
+ "learning_rate": 2.5688858559204053e-07,
822
+ "logits/chosen": -1.7386023998260498,
823
+ "logits/rejected": -1.650543212890625,
824
+ "logps/chosen": -324.80194091796875,
825
+ "logps/rejected": -326.7395935058594,
826
+ "loss": 0.6328,
827
+ "rewards/accuracies": 0.7124999761581421,
828
+ "rewards/chosen": -0.22357122600078583,
829
+ "rewards/margins": 0.23022878170013428,
830
+ "rewards/rejected": -0.4538000226020813,
831
+ "step": 520
832
+ },
833
+ {
834
+ "epoch": 0.55,
835
+ "learning_rate": 2.477035464388184e-07,
836
+ "logits/chosen": -1.6688206195831299,
837
+ "logits/rejected": -1.5695548057556152,
838
+ "logps/chosen": -361.03753662109375,
839
+ "logps/rejected": -339.63226318359375,
840
+ "loss": 0.6309,
841
+ "rewards/accuracies": 0.625,
842
+ "rewards/chosen": -0.25370585918426514,
843
+ "rewards/margins": 0.17806772887706757,
844
+ "rewards/rejected": -0.4317736029624939,
845
+ "step": 530
846
+ },
847
+ {
848
+ "epoch": 0.57,
849
+ "learning_rate": 2.3852160739000706e-07,
850
+ "logits/chosen": -1.827457070350647,
851
+ "logits/rejected": -1.7111104726791382,
852
+ "logps/chosen": -378.85089111328125,
853
+ "logps/rejected": -386.2629699707031,
854
+ "loss": 0.6217,
855
+ "rewards/accuracies": 0.65625,
856
+ "rewards/chosen": -0.24295127391815186,
857
+ "rewards/margins": 0.199497789144516,
858
+ "rewards/rejected": -0.44244909286499023,
859
+ "step": 540
860
+ },
861
+ {
862
+ "epoch": 0.58,
863
+ "learning_rate": 2.2935516363191693e-07,
864
+ "logits/chosen": -1.7694709300994873,
865
+ "logits/rejected": -1.6132080554962158,
866
+ "logps/chosen": -350.4745788574219,
867
+ "logps/rejected": -334.8966369628906,
868
+ "loss": 0.6415,
869
+ "rewards/accuracies": 0.6875,
870
+ "rewards/chosen": -0.30643701553344727,
871
+ "rewards/margins": 0.2043788731098175,
872
+ "rewards/rejected": -0.5108158588409424,
873
+ "step": 550
874
+ },
875
+ {
876
+ "epoch": 0.59,
877
+ "learning_rate": 2.2021658943294407e-07,
878
+ "logits/chosen": -1.7891242504119873,
879
+ "logits/rejected": -1.6618837118148804,
880
+ "logps/chosen": -348.5799865722656,
881
+ "logps/rejected": -320.85675048828125,
882
+ "loss": 0.6252,
883
+ "rewards/accuracies": 0.675000011920929,
884
+ "rewards/chosen": -0.30380532145500183,
885
+ "rewards/margins": 0.2296813428401947,
886
+ "rewards/rejected": -0.5334866642951965,
887
+ "step": 560
888
+ },
889
+ {
890
+ "epoch": 0.6,
891
+ "learning_rate": 2.1111822143888928e-07,
892
+ "logits/chosen": -1.8157535791397095,
893
+ "logits/rejected": -1.6647037267684937,
894
+ "logps/chosen": -365.7635192871094,
895
+ "logps/rejected": -348.455810546875,
896
+ "loss": 0.6396,
897
+ "rewards/accuracies": 0.675000011920929,
898
+ "rewards/chosen": -0.3315108120441437,
899
+ "rewards/margins": 0.16533556580543518,
900
+ "rewards/rejected": -0.49684637784957886,
901
+ "step": 570
902
+ },
903
+ {
904
+ "epoch": 0.61,
905
+ "learning_rate": 2.0207234201906545e-07,
906
+ "logits/chosen": -1.6991872787475586,
907
+ "logits/rejected": -1.5607801675796509,
908
+ "logps/chosen": -355.08758544921875,
909
+ "logps/rejected": -318.2015075683594,
910
+ "loss": 0.6263,
911
+ "rewards/accuracies": 0.6499999761581421,
912
+ "rewards/chosen": -0.32228100299835205,
913
+ "rewards/margins": 0.16686378419399261,
914
+ "rewards/rejected": -0.48914486169815063,
915
+ "step": 580
916
+ },
917
+ {
918
+ "epoch": 0.62,
919
+ "learning_rate": 1.9309116268567671e-07,
920
+ "logits/chosen": -1.8247711658477783,
921
+ "logits/rejected": -1.7237234115600586,
922
+ "logps/chosen": -346.73529052734375,
923
+ "logps/rejected": -332.7265625,
924
+ "loss": 0.631,
925
+ "rewards/accuracies": 0.6000000238418579,
926
+ "rewards/chosen": -0.2523882985115051,
927
+ "rewards/margins": 0.11806211620569229,
928
+ "rewards/rejected": -0.37045034766197205,
929
+ "step": 590
930
+ },
931
+ {
932
+ "epoch": 0.63,
933
+ "learning_rate": 1.8418680760885024e-07,
934
+ "logits/chosen": -1.803122878074646,
935
+ "logits/rejected": -1.6771843433380127,
936
+ "logps/chosen": -412.44671630859375,
937
+ "logps/rejected": -402.94281005859375,
938
+ "loss": 0.6308,
939
+ "rewards/accuracies": 0.6937500238418579,
940
+ "rewards/chosen": -0.22780685126781464,
941
+ "rewards/margins": 0.22766557335853577,
942
+ "rewards/rejected": -0.455472469329834,
943
+ "step": 600
944
+ },
945
+ {
946
+ "epoch": 0.63,
947
+ "eval_logits/chosen": -1.8269299268722534,
948
+ "eval_logits/rejected": -1.6965351104736328,
949
+ "eval_logps/chosen": -370.46728515625,
950
+ "eval_logps/rejected": -348.8290710449219,
951
+ "eval_loss": 0.6333169937133789,
952
+ "eval_rewards/accuracies": 0.6726190447807312,
953
+ "eval_rewards/chosen": -0.27829232811927795,
954
+ "eval_rewards/margins": 0.2031887322664261,
955
+ "eval_rewards/rejected": -0.48148107528686523,
956
+ "eval_runtime": 333.0266,
957
+ "eval_samples_per_second": 6.006,
958
+ "eval_steps_per_second": 0.189,
959
+ "step": 600
960
+ },
961
+ {
962
+ "epoch": 0.64,
963
+ "learning_rate": 1.753712972495764e-07,
964
+ "logits/chosen": -1.858438491821289,
965
+ "logits/rejected": -1.6571956872940063,
966
+ "logps/chosen": -356.0987854003906,
967
+ "logps/rejected": -304.0067138671875,
968
+ "loss": 0.6257,
969
+ "rewards/accuracies": 0.699999988079071,
970
+ "rewards/chosen": -0.263310968875885,
971
+ "rewards/margins": 0.19186006486415863,
972
+ "rewards/rejected": -0.45517101883888245,
973
+ "step": 610
974
+ },
975
+ {
976
+ "epoch": 0.65,
977
+ "learning_rate": 1.666565321326512e-07,
978
+ "logits/chosen": -1.8659627437591553,
979
+ "logits/rejected": -1.709458589553833,
980
+ "logps/chosen": -364.3775634765625,
981
+ "logps/rejected": -322.25396728515625,
982
+ "loss": 0.6239,
983
+ "rewards/accuracies": 0.675000011920929,
984
+ "rewards/chosen": -0.3321610391139984,
985
+ "rewards/margins": 0.1703757643699646,
986
+ "rewards/rejected": -0.5025367736816406,
987
+ "step": 620
988
+ },
989
+ {
990
+ "epoch": 0.66,
991
+ "learning_rate": 1.5805427678152674e-07,
992
+ "logits/chosen": -1.641649603843689,
993
+ "logits/rejected": -1.5464293956756592,
994
+ "logps/chosen": -341.80950927734375,
995
+ "logps/rejected": -339.9558410644531,
996
+ "loss": 0.6251,
997
+ "rewards/accuracies": 0.6499999761581421,
998
+ "rewards/chosen": -0.2863956093788147,
999
+ "rewards/margins": 0.20320038497447968,
1000
+ "rewards/rejected": -0.4895959794521332,
1001
+ "step": 630
1002
+ },
1003
+ {
1004
+ "epoch": 0.67,
1005
+ "learning_rate": 1.4957614383675767e-07,
1006
+ "logits/chosen": -1.7454828023910522,
1007
+ "logits/rejected": -1.5985407829284668,
1008
+ "logps/chosen": -347.81488037109375,
1009
+ "logps/rejected": -337.49676513671875,
1010
+ "loss": 0.6222,
1011
+ "rewards/accuracies": 0.7124999761581421,
1012
+ "rewards/chosen": -0.264575332403183,
1013
+ "rewards/margins": 0.2227667272090912,
1014
+ "rewards/rejected": -0.48734205961227417,
1015
+ "step": 640
1016
+ },
1017
+ {
1018
+ "epoch": 0.68,
1019
+ "learning_rate": 1.4123357837948176e-07,
1020
+ "logits/chosen": -1.9199473857879639,
1021
+ "logits/rejected": -1.715175986289978,
1022
+ "logps/chosen": -401.645263671875,
1023
+ "logps/rejected": -365.5584411621094,
1024
+ "loss": 0.6101,
1025
+ "rewards/accuracies": 0.65625,
1026
+ "rewards/chosen": -0.28458407521247864,
1027
+ "rewards/margins": 0.24697256088256836,
1028
+ "rewards/rejected": -0.5315566062927246,
1029
+ "step": 650
1030
+ },
1031
+ {
1032
+ "epoch": 0.69,
1033
+ "learning_rate": 1.3303784248109808e-07,
1034
+ "logits/chosen": -1.8878719806671143,
1035
+ "logits/rejected": -1.8138688802719116,
1036
+ "logps/chosen": -379.57672119140625,
1037
+ "logps/rejected": -363.7694091796875,
1038
+ "loss": 0.6202,
1039
+ "rewards/accuracies": 0.6812499761581421,
1040
+ "rewards/chosen": -0.2839195132255554,
1041
+ "rewards/margins": 0.23723697662353516,
1042
+ "rewards/rejected": -0.5211564898490906,
1043
+ "step": 660
1044
+ },
1045
+ {
1046
+ "epoch": 0.7,
1047
+ "learning_rate": 1.2500000000000005e-07,
1048
+ "logits/chosen": -1.7933881282806396,
1049
+ "logits/rejected": -1.708486557006836,
1050
+ "logps/chosen": -361.3472595214844,
1051
+ "logps/rejected": -356.447509765625,
1052
+ "loss": 0.6204,
1053
+ "rewards/accuracies": 0.637499988079071,
1054
+ "rewards/chosen": -0.2589954733848572,
1055
+ "rewards/margins": 0.21957698464393616,
1056
+ "rewards/rejected": -0.47857245802879333,
1057
+ "step": 670
1058
+ },
1059
+ {
1060
+ "epoch": 0.71,
1061
+ "learning_rate": 1.1713090164588606e-07,
1062
+ "logits/chosen": -1.675574541091919,
1063
+ "logits/rejected": -1.6282098293304443,
1064
+ "logps/chosen": -336.2772521972656,
1065
+ "logps/rejected": -314.0538330078125,
1066
+ "loss": 0.6279,
1067
+ "rewards/accuracies": 0.5874999761581421,
1068
+ "rewards/chosen": -0.24504590034484863,
1069
+ "rewards/margins": 0.09308433532714844,
1070
+ "rewards/rejected": -0.3381302058696747,
1071
+ "step": 680
1072
+ },
1073
+ {
1074
+ "epoch": 0.72,
1075
+ "learning_rate": 1.094411703318115e-07,
1076
+ "logits/chosen": -1.7943480014801025,
1077
+ "logits/rejected": -1.650317907333374,
1078
+ "logps/chosen": -339.38836669921875,
1079
+ "logps/rejected": -322.80755615234375,
1080
+ "loss": 0.6253,
1081
+ "rewards/accuracies": 0.675000011920929,
1082
+ "rewards/chosen": -0.22786399722099304,
1083
+ "rewards/margins": 0.1742563247680664,
1084
+ "rewards/rejected": -0.40212029218673706,
1085
+ "step": 690
1086
+ },
1087
+ {
1088
+ "epoch": 0.73,
1089
+ "learning_rate": 1.0194118683375502e-07,
1090
+ "logits/chosen": -1.752105951309204,
1091
+ "logits/rejected": -1.5573476552963257,
1092
+ "logps/chosen": -324.02362060546875,
1093
+ "logps/rejected": -319.4602355957031,
1094
+ "loss": 0.62,
1095
+ "rewards/accuracies": 0.643750011920929,
1096
+ "rewards/chosen": -0.25912636518478394,
1097
+ "rewards/margins": 0.1973043978214264,
1098
+ "rewards/rejected": -0.4564308226108551,
1099
+ "step": 700
1100
+ },
1101
+ {
1102
+ "epoch": 0.73,
1103
+ "eval_logits/chosen": -1.8149123191833496,
1104
+ "eval_logits/rejected": -1.6841386556625366,
1105
+ "eval_logps/chosen": -365.8656005859375,
1106
+ "eval_logps/rejected": -345.7306213378906,
1107
+ "eval_loss": 0.63123619556427,
1108
+ "eval_rewards/accuracies": 0.6805555820465088,
1109
+ "eval_rewards/chosen": -0.2322753369808197,
1110
+ "eval_rewards/margins": 0.21822109818458557,
1111
+ "eval_rewards/rejected": -0.45049646496772766,
1112
+ "eval_runtime": 334.8941,
1113
+ "eval_samples_per_second": 5.972,
1114
+ "eval_steps_per_second": 0.188,
1115
+ "step": 700
1116
+ },
1117
+ {
1118
+ "epoch": 0.74,
1119
+ "learning_rate": 9.464107577705886e-08,
1120
+ "logits/chosen": -1.8157602548599243,
1121
+ "logits/rejected": -1.607846975326538,
1122
+ "logps/chosen": -384.46343994140625,
1123
+ "logps/rejected": -319.81805419921875,
1124
+ "loss": 0.6142,
1125
+ "rewards/accuracies": 0.6812499761581421,
1126
+ "rewards/chosen": -0.23041348159313202,
1127
+ "rewards/margins": 0.2244310826063156,
1128
+ "rewards/rejected": -0.45484456419944763,
1129
+ "step": 710
1130
+ },
1131
+ {
1132
+ "epoch": 0.75,
1133
+ "learning_rate": 8.755069196866013e-08,
1134
+ "logits/chosen": -1.7827469110488892,
1135
+ "logits/rejected": -1.6819953918457031,
1136
+ "logps/chosen": -368.9613952636719,
1137
+ "logps/rejected": -345.7911071777344,
1138
+ "loss": 0.6334,
1139
+ "rewards/accuracies": 0.6499999761581421,
1140
+ "rewards/chosen": -0.24999408423900604,
1141
+ "rewards/margins": 0.18186303973197937,
1142
+ "rewards/rejected": -0.431857168674469,
1143
+ "step": 720
1144
+ },
1145
+ {
1146
+ "epoch": 0.76,
1147
+ "learning_rate": 8.067960709356478e-08,
1148
+ "logits/chosen": -1.8376483917236328,
1149
+ "logits/rejected": -1.677075982093811,
1150
+ "logps/chosen": -381.3594665527344,
1151
+ "logps/rejected": -341.1945495605469,
1152
+ "loss": 0.6275,
1153
+ "rewards/accuracies": 0.6812499761581421,
1154
+ "rewards/chosen": -0.25991183519363403,
1155
+ "rewards/margins": 0.2162000834941864,
1156
+ "rewards/rejected": -0.47611188888549805,
1157
+ "step": 730
1158
+ },
1159
+ {
1160
+ "epoch": 0.77,
1161
+ "learning_rate": 7.403709679352216e-08,
1162
+ "logits/chosen": -1.7825815677642822,
1163
+ "logits/rejected": -1.5635161399841309,
1164
+ "logps/chosen": -405.16839599609375,
1165
+ "logps/rejected": -342.57476806640625,
1166
+ "loss": 0.6172,
1167
+ "rewards/accuracies": 0.731249988079071,
1168
+ "rewards/chosen": -0.2256811410188675,
1169
+ "rewards/margins": 0.23789973556995392,
1170
+ "rewards/rejected": -0.4635809063911438,
1171
+ "step": 740
1172
+ },
1173
+ {
1174
+ "epoch": 0.78,
1175
+ "learning_rate": 6.763212814534483e-08,
1176
+ "logits/chosen": -1.8319247961044312,
1177
+ "logits/rejected": -1.7559360265731812,
1178
+ "logps/chosen": -367.5751647949219,
1179
+ "logps/rejected": -367.4506530761719,
1180
+ "loss": 0.6208,
1181
+ "rewards/accuracies": 0.65625,
1182
+ "rewards/chosen": -0.29129502177238464,
1183
+ "rewards/margins": 0.18480955064296722,
1184
+ "rewards/rejected": -0.4761045575141907,
1185
+ "step": 750
1186
+ },
1187
+ {
1188
+ "epoch": 0.8,
1189
+ "learning_rate": 6.147334755577596e-08,
1190
+ "logits/chosen": -1.729148268699646,
1191
+ "logits/rejected": -1.6354280710220337,
1192
+ "logps/chosen": -353.6579895019531,
1193
+ "logps/rejected": -341.3516845703125,
1194
+ "loss": 0.6157,
1195
+ "rewards/accuracies": 0.65625,
1196
+ "rewards/chosen": -0.3305814862251282,
1197
+ "rewards/margins": 0.20686399936676025,
1198
+ "rewards/rejected": -0.5374454259872437,
1199
+ "step": 760
1200
+ },
1201
+ {
1202
+ "epoch": 0.81,
1203
+ "learning_rate": 5.556906908924655e-08,
1204
+ "logits/chosen": -1.774147629737854,
1205
+ "logits/rejected": -1.7218784093856812,
1206
+ "logps/chosen": -382.23968505859375,
1207
+ "logps/rejected": -393.50579833984375,
1208
+ "loss": 0.621,
1209
+ "rewards/accuracies": 0.6499999761581421,
1210
+ "rewards/chosen": -0.30208078026771545,
1211
+ "rewards/margins": 0.19531475007534027,
1212
+ "rewards/rejected": -0.49739551544189453,
1213
+ "step": 770
1214
+ },
1215
+ {
1216
+ "epoch": 0.82,
1217
+ "learning_rate": 4.992726324427901e-08,
1218
+ "logits/chosen": -1.6901906728744507,
1219
+ "logits/rejected": -1.58046555519104,
1220
+ "logps/chosen": -335.16583251953125,
1221
+ "logps/rejected": -316.52978515625,
1222
+ "loss": 0.6264,
1223
+ "rewards/accuracies": 0.581250011920929,
1224
+ "rewards/chosen": -0.2825678884983063,
1225
+ "rewards/margins": 0.153324693441391,
1226
+ "rewards/rejected": -0.4358925223350525,
1227
+ "step": 780
1228
+ },
1229
+ {
1230
+ "epoch": 0.83,
1231
+ "learning_rate": 4.4555546193688734e-08,
1232
+ "logits/chosen": -1.795194387435913,
1233
+ "logits/rejected": -1.722459077835083,
1234
+ "logps/chosen": -390.23846435546875,
1235
+ "logps/rejected": -379.72332763671875,
1236
+ "loss": 0.635,
1237
+ "rewards/accuracies": 0.65625,
1238
+ "rewards/chosen": -0.29042816162109375,
1239
+ "rewards/margins": 0.2163534164428711,
1240
+ "rewards/rejected": -0.5067815184593201,
1241
+ "step": 790
1242
+ },
1243
+ {
1244
+ "epoch": 0.84,
1245
+ "learning_rate": 3.94611695031086e-08,
1246
+ "logits/chosen": -1.743678331375122,
1247
+ "logits/rejected": -1.5813570022583008,
1248
+ "logps/chosen": -426.080078125,
1249
+ "logps/rejected": -366.8641662597656,
1250
+ "loss": 0.6055,
1251
+ "rewards/accuracies": 0.699999988079071,
1252
+ "rewards/chosen": -0.26492840051651,
1253
+ "rewards/margins": 0.23690399527549744,
1254
+ "rewards/rejected": -0.5018323659896851,
1255
+ "step": 800
1256
+ },
1257
+ {
1258
+ "epoch": 0.84,
1259
+ "eval_logits/chosen": -1.809856653213501,
1260
+ "eval_logits/rejected": -1.6792672872543335,
1261
+ "eval_logps/chosen": -371.4098815917969,
1262
+ "eval_logps/rejected": -352.3697204589844,
1263
+ "eval_loss": 0.6287124156951904,
1264
+ "eval_rewards/accuracies": 0.6865079402923584,
1265
+ "eval_rewards/chosen": -0.2877180278301239,
1266
+ "eval_rewards/margins": 0.2291695475578308,
1267
+ "eval_rewards/rejected": -0.5168876051902771,
1268
+ "eval_runtime": 335.4301,
1269
+ "eval_samples_per_second": 5.962,
1270
+ "eval_steps_per_second": 0.188,
1271
+ "step": 800
1272
+ },
1273
+ {
1274
+ "epoch": 0.85,
1275
+ "learning_rate": 3.465101034171603e-08,
1276
+ "logits/chosen": -1.8098558187484741,
1277
+ "logits/rejected": -1.5995447635650635,
1278
+ "logps/chosen": -368.4771423339844,
1279
+ "logps/rejected": -318.16156005859375,
1280
+ "loss": 0.6273,
1281
+ "rewards/accuracies": 0.612500011920929,
1282
+ "rewards/chosen": -0.309670627117157,
1283
+ "rewards/margins": 0.13772638142108917,
1284
+ "rewards/rejected": -0.44739705324172974,
1285
+ "step": 810
1286
+ },
1287
+ {
1288
+ "epoch": 0.86,
1289
+ "learning_rate": 3.013156219837776e-08,
1290
+ "logits/chosen": -1.638256311416626,
1291
+ "logits/rejected": -1.5319178104400635,
1292
+ "logps/chosen": -319.73828125,
1293
+ "logps/rejected": -316.17022705078125,
1294
+ "loss": 0.6382,
1295
+ "rewards/accuracies": 0.6937500238418579,
1296
+ "rewards/chosen": -0.28010720014572144,
1297
+ "rewards/margins": 0.2056259661912918,
1298
+ "rewards/rejected": -0.48573318123817444,
1299
+ "step": 820
1300
+ },
1301
+ {
1302
+ "epoch": 0.87,
1303
+ "learning_rate": 2.5908926115744994e-08,
1304
+ "logits/chosen": -1.6816116571426392,
1305
+ "logits/rejected": -1.6694707870483398,
1306
+ "logps/chosen": -323.09210205078125,
1307
+ "logps/rejected": -325.642822265625,
1308
+ "loss": 0.6157,
1309
+ "rewards/accuracies": 0.737500011920929,
1310
+ "rewards/chosen": -0.2693836987018585,
1311
+ "rewards/margins": 0.23751957714557648,
1312
+ "rewards/rejected": -0.5069032907485962,
1313
+ "step": 830
1314
+ },
1315
+ {
1316
+ "epoch": 0.88,
1317
+ "learning_rate": 2.19888024541324e-08,
1318
+ "logits/chosen": -1.7210891246795654,
1319
+ "logits/rejected": -1.6208484172821045,
1320
+ "logps/chosen": -391.98187255859375,
1321
+ "logps/rejected": -361.2024841308594,
1322
+ "loss": 0.6279,
1323
+ "rewards/accuracies": 0.574999988079071,
1324
+ "rewards/chosen": -0.33281436562538147,
1325
+ "rewards/margins": 0.1134597659111023,
1326
+ "rewards/rejected": -0.4462741017341614,
1327
+ "step": 840
1328
+ },
1329
+ {
1330
+ "epoch": 0.89,
1331
+ "learning_rate": 1.8376483196299558e-08,
1332
+ "logits/chosen": -1.8490521907806396,
1333
+ "logits/rejected": -1.7024778127670288,
1334
+ "logps/chosen": -389.59765625,
1335
+ "logps/rejected": -336.8246154785156,
1336
+ "loss": 0.6253,
1337
+ "rewards/accuracies": 0.675000011920929,
1338
+ "rewards/chosen": -0.25476616621017456,
1339
+ "rewards/margins": 0.24750569462776184,
1340
+ "rewards/rejected": -0.502271831035614,
1341
+ "step": 850
1342
+ },
1343
+ {
1344
+ "epoch": 0.9,
1345
+ "learning_rate": 1.507684480352292e-08,
1346
+ "logits/chosen": -1.7873809337615967,
1347
+ "logits/rejected": -1.711810827255249,
1348
+ "logps/chosen": -362.58319091796875,
1349
+ "logps/rejected": -348.2912292480469,
1350
+ "loss": 0.6287,
1351
+ "rewards/accuracies": 0.6812499761581421,
1352
+ "rewards/chosen": -0.2954009175300598,
1353
+ "rewards/margins": 0.25710874795913696,
1354
+ "rewards/rejected": -0.5525097846984863,
1355
+ "step": 860
1356
+ },
1357
+ {
1358
+ "epoch": 0.91,
1359
+ "learning_rate": 1.2094341632602062e-08,
1360
+ "logits/chosen": -1.7503105401992798,
1361
+ "logits/rejected": -1.698735237121582,
1362
+ "logps/chosen": -318.9074401855469,
1363
+ "logps/rejected": -333.00909423828125,
1364
+ "loss": 0.6184,
1365
+ "rewards/accuracies": 0.6937500238418579,
1366
+ "rewards/chosen": -0.25144487619400024,
1367
+ "rewards/margins": 0.24724820256233215,
1368
+ "rewards/rejected": -0.49869304895401,
1369
+ "step": 870
1370
+ },
1371
+ {
1372
+ "epoch": 0.92,
1373
+ "learning_rate": 9.432999922687396e-09,
1374
+ "logits/chosen": -1.721398115158081,
1375
+ "logits/rejected": -1.566706895828247,
1376
+ "logps/chosen": -368.14556884765625,
1377
+ "logps/rejected": -340.03277587890625,
1378
+ "loss": 0.6259,
1379
+ "rewards/accuracies": 0.6499999761581421,
1380
+ "rewards/chosen": -0.3243555426597595,
1381
+ "rewards/margins": 0.1919844150543213,
1382
+ "rewards/rejected": -0.516339898109436,
1383
+ "step": 880
1384
+ },
1385
+ {
1386
+ "epoch": 0.93,
1387
+ "learning_rate": 7.096412360046544e-09,
1388
+ "logits/chosen": -1.7436186075210571,
1389
+ "logits/rejected": -1.6783809661865234,
1390
+ "logps/chosen": -356.3089294433594,
1391
+ "logps/rejected": -356.8370666503906,
1392
+ "loss": 0.6375,
1393
+ "rewards/accuracies": 0.6625000238418579,
1394
+ "rewards/chosen": -0.3216833472251892,
1395
+ "rewards/margins": 0.15957646071910858,
1396
+ "rewards/rejected": -0.4812597632408142,
1397
+ "step": 890
1398
+ },
1399
+ {
1400
+ "epoch": 0.94,
1401
+ "learning_rate": 5.087733228106517e-09,
1402
+ "logits/chosen": -1.7303647994995117,
1403
+ "logits/rejected": -1.5875599384307861,
1404
+ "logps/chosen": -385.02294921875,
1405
+ "logps/rejected": -370.8409729003906,
1406
+ "loss": 0.6357,
1407
+ "rewards/accuracies": 0.675000011920929,
1408
+ "rewards/chosen": -0.29238641262054443,
1409
+ "rewards/margins": 0.15909823775291443,
1410
+ "rewards/rejected": -0.451484739780426,
1411
+ "step": 900
1412
+ },
1413
+ {
1414
+ "epoch": 0.94,
1415
+ "eval_logits/chosen": -1.8041099309921265,
1416
+ "eval_logits/rejected": -1.6730901002883911,
1417
+ "eval_logps/chosen": -373.1398620605469,
1418
+ "eval_logps/rejected": -354.2070617675781,
1419
+ "eval_loss": 0.6285176873207092,
1420
+ "eval_rewards/accuracies": 0.6805555820465088,
1421
+ "eval_rewards/chosen": -0.30501797795295715,
1422
+ "eval_rewards/margins": 0.23024281859397888,
1423
+ "eval_rewards/rejected": -0.535260796546936,
1424
+ "eval_runtime": 333.5177,
1425
+ "eval_samples_per_second": 5.997,
1426
+ "eval_steps_per_second": 0.189,
1427
+ "step": 900
1428
+ },
1429
+ {
1430
+ "epoch": 0.95,
1431
+ "learning_rate": 3.4096741493194193e-09,
1432
+ "logits/chosen": -1.7266037464141846,
1433
+ "logits/rejected": -1.596879482269287,
1434
+ "logps/chosen": -330.7945556640625,
1435
+ "logps/rejected": -328.31414794921875,
1436
+ "loss": 0.6192,
1437
+ "rewards/accuracies": 0.675000011920929,
1438
+ "rewards/chosen": -0.3427024781703949,
1439
+ "rewards/margins": 0.17462757229804993,
1440
+ "rewards/rejected": -0.5173300504684448,
1441
+ "step": 910
1442
+ },
1443
+ {
1444
+ "epoch": 0.96,
1445
+ "learning_rate": 2.064500424599436e-09,
1446
+ "logits/chosen": -1.789232611656189,
1447
+ "logits/rejected": -1.6246925592422485,
1448
+ "logps/chosen": -370.4477233886719,
1449
+ "logps/rejected": -351.1963806152344,
1450
+ "loss": 0.6148,
1451
+ "rewards/accuracies": 0.731249988079071,
1452
+ "rewards/chosen": -0.2979881167411804,
1453
+ "rewards/margins": 0.2658146917819977,
1454
+ "rewards/rejected": -0.5638028383255005,
1455
+ "step": 920
1456
+ },
1457
+ {
1458
+ "epoch": 0.97,
1459
+ "learning_rate": 1.0540279752731252e-09,
1460
+ "logits/chosen": -1.856554388999939,
1461
+ "logits/rejected": -1.6683080196380615,
1462
+ "logps/chosen": -354.5975036621094,
1463
+ "logps/rejected": -335.6180725097656,
1464
+ "loss": 0.6047,
1465
+ "rewards/accuracies": 0.7562500238418579,
1466
+ "rewards/chosen": -0.24988925457000732,
1467
+ "rewards/margins": 0.27003243565559387,
1468
+ "rewards/rejected": -0.5199216604232788,
1469
+ "step": 930
1470
+ },
1471
+ {
1472
+ "epoch": 0.98,
1473
+ "learning_rate": 3.7962089167095645e-10,
1474
+ "logits/chosen": -1.79756760597229,
1475
+ "logits/rejected": -1.610603928565979,
1476
+ "logps/chosen": -409.74462890625,
1477
+ "logps/rejected": -392.2386169433594,
1478
+ "loss": 0.6098,
1479
+ "rewards/accuracies": 0.7437499761581421,
1480
+ "rewards/chosen": -0.29289019107818604,
1481
+ "rewards/margins": 0.28694167733192444,
1482
+ "rewards/rejected": -0.5798318386077881,
1483
+ "step": 940
1484
+ },
1485
+ {
1486
+ "epoch": 0.99,
1487
+ "learning_rate": 4.2189591669322674e-11,
1488
+ "logits/chosen": -1.8283344507217407,
1489
+ "logits/rejected": -1.6751607656478882,
1490
+ "logps/chosen": -369.1002197265625,
1491
+ "logps/rejected": -350.08013916015625,
1492
+ "loss": 0.621,
1493
+ "rewards/accuracies": 0.71875,
1494
+ "rewards/chosen": -0.29409345984458923,
1495
+ "rewards/margins": 0.25061991810798645,
1496
+ "rewards/rejected": -0.5447134375572205,
1497
+ "step": 950
1498
+ },
1499
+ {
1500
+ "epoch": 1.0,
1501
+ "step": 955,
1502
+ "total_flos": 0.0,
1503
+ "train_loss": 0.6456358959537526,
1504
+ "train_runtime": 17140.5232,
1505
+ "train_samples_per_second": 3.567,
1506
+ "train_steps_per_second": 0.056
1507
+ }
1508
+ ],
1509
+ "logging_steps": 10,
1510
+ "max_steps": 955,
1511
+ "num_input_tokens_seen": 0,
1512
+ "num_train_epochs": 1,
1513
+ "save_steps": 100,
1514
+ "total_flos": 0.0,
1515
+ "train_batch_size": 8,
1516
+ "trial_name": null,
1517
+ "trial_params": null
1518
+ }