BraylonDash commited on
Commit
415f17b
1 Parent(s): 146c680

Model save

Browse files
README.md ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ library_name: peft
4
+ tags:
5
+ - trl
6
+ - dpo
7
+ - generated_from_trainer
8
+ base_model: microsoft/phi-2
9
+ model-index:
10
+ - name: phi-2-ipo-test-iter-0
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # phi-2-ipo-test-iter-0
18
+
19
+ This model is a fine-tuned version of [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) on an unspecified dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 2546.4375
22
+ - Rewards/chosen: -0.1591
23
+ - Rewards/rejected: -0.1612
24
+ - Rewards/accuracies: 0.5220
25
+ - Rewards/margins: 0.0021
26
+ - Logps/rejected: -249.6534
27
+ - Logps/chosen: -272.5227
28
+ - Logits/rejected: 0.4171
29
+ - Logits/chosen: 0.3526
30
+
31
+ ## Model description
32
+
33
+ More information needed
34
+
35
+ ## Intended uses & limitations
36
+
37
+ More information needed
38
+
39
+ ## Training and evaluation data
40
+
41
+ More information needed
42
+
43
+ ## Training procedure
44
+
45
+ ### Training hyperparameters
46
+
47
+ The following hyperparameters were used during training:
48
+ - learning_rate: 5e-06
49
+ - train_batch_size: 4
50
+ - eval_batch_size: 4
51
+ - seed: 42
52
+ - distributed_type: multi-GPU
53
+ - gradient_accumulation_steps: 4
54
+ - total_train_batch_size: 16
55
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
56
+ - lr_scheduler_type: cosine
57
+ - lr_scheduler_warmup_ratio: 0.1
58
+ - num_epochs: 4
59
+
60
+ ### Training results
61
+
62
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
63
+ |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
64
+ | 2477.3281 | 0.32 | 100 | 2500.7156 | -0.0018 | -0.0018 | 0.4930 | -0.0000 | -233.7207 | -256.7978 | 0.8796 | 0.8221 |
65
+ | 2224.3488 | 0.64 | 200 | 2499.8904 | -0.0195 | -0.0198 | 0.5015 | 0.0003 | -235.5204 | -258.5673 | 0.8051 | 0.7462 |
66
+ | 1898.0719 | 0.96 | 300 | 2505.6912 | -0.0563 | -0.0571 | 0.5140 | 0.0008 | -239.2530 | -262.2491 | 0.6844 | 0.6233 |
67
+ | 1879.8852 | 1.28 | 400 | 2516.0835 | -0.0944 | -0.0957 | 0.5200 | 0.0013 | -243.1053 | -266.0533 | 0.5839 | 0.5215 |
68
+ | 1917.2811 | 1.6 | 500 | 2527.1995 | -0.1156 | -0.1170 | 0.5115 | 0.0014 | -245.2343 | -268.1747 | 0.5244 | 0.4611 |
69
+ | 1799.3824 | 1.92 | 600 | 2534.4292 | -0.1363 | -0.1381 | 0.5210 | 0.0018 | -247.3504 | -270.2482 | 0.4714 | 0.4075 |
70
+ | 1751.5762 | 2.24 | 700 | 2531.3550 | -0.1448 | -0.1474 | 0.5180 | 0.0026 | -248.2780 | -271.0988 | 0.4545 | 0.3906 |
71
+ | 1711.1711 | 2.56 | 800 | 2536.2451 | -0.1487 | -0.1511 | 0.5145 | 0.0024 | -248.6440 | -271.4834 | 0.4402 | 0.3759 |
72
+ | 1894.4447 | 2.88 | 900 | 2542.6299 | -0.1549 | -0.1570 | 0.5235 | 0.0022 | -249.2417 | -272.1000 | 0.4262 | 0.3618 |
73
+ | 1798.5389 | 3.2 | 1000 | 2542.7288 | -0.1581 | -0.1604 | 0.5205 | 0.0023 | -249.5780 | -272.4200 | 0.4202 | 0.3559 |
74
+ | 1834.9711 | 3.52 | 1100 | 2542.2373 | -0.1586 | -0.1610 | 0.5205 | 0.0024 | -249.6345 | -272.4703 | 0.4177 | 0.3532 |
75
+ | 1765.5148 | 3.84 | 1200 | 2546.1714 | -0.1589 | -0.1610 | 0.5220 | 0.0021 | -249.6357 | -272.5010 | 0.4160 | 0.3515 |
76
+
77
+
78
+ ### Framework versions
79
+
80
+ - PEFT 0.7.1
81
+ - Transformers 4.36.2
82
+ - Pytorch 2.2.1+cu121
83
+ - Datasets 2.14.6
84
+ - Tokenizers 0.15.2
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:66fd2df57d99200948333dab1ba44500e7fb32cf66504b7ab697d5ccb72aaa09
3
  size 41977616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5c1e67946b196556debdce64a2a65e344ae1ad85cba635e5b08b0a06b1003e4
3
  size 41977616
all_results.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.99,
3
+ "eval_logits/chosen": 0.3525713086128235,
4
+ "eval_logits/rejected": 0.4171118438243866,
5
+ "eval_logps/chosen": -272.5227355957031,
6
+ "eval_logps/rejected": -249.6534423828125,
7
+ "eval_loss": 2546.4375,
8
+ "eval_rewards/accuracies": 0.5220000147819519,
9
+ "eval_rewards/chosen": -0.15908558666706085,
10
+ "eval_rewards/margins": 0.0020662047900259495,
11
+ "eval_rewards/rejected": -0.16115178167819977,
12
+ "eval_runtime": 411.7057,
13
+ "eval_samples": 2000,
14
+ "eval_samples_per_second": 4.858,
15
+ "eval_steps_per_second": 1.214,
16
+ "train_loss": 1918.4646684695513,
17
+ "train_runtime": 14252.7073,
18
+ "train_samples": 30567,
19
+ "train_samples_per_second": 1.403,
20
+ "train_steps_per_second": 0.088
21
+ }
eval_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.99,
3
+ "eval_logits/chosen": 0.3525713086128235,
4
+ "eval_logits/rejected": 0.4171118438243866,
5
+ "eval_logps/chosen": -272.5227355957031,
6
+ "eval_logps/rejected": -249.6534423828125,
7
+ "eval_loss": 2546.4375,
8
+ "eval_rewards/accuracies": 0.5220000147819519,
9
+ "eval_rewards/chosen": -0.15908558666706085,
10
+ "eval_rewards/margins": 0.0020662047900259495,
11
+ "eval_rewards/rejected": -0.16115178167819977,
12
+ "eval_runtime": 411.7057,
13
+ "eval_samples": 2000,
14
+ "eval_samples_per_second": 4.858,
15
+ "eval_steps_per_second": 1.214
16
+ }
runs/Mar30_15-30-11_Braylon/events.out.tfevents.1711773088.Braylon.21497.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5aee622b75a103ad42e6bea70d987ce263ca3cd615f761b507fd5b614033ac24
3
- size 90250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcefbd3d4afe4692920068afa1be5a0b979d01eb00227742305862b7f1dfd8ae
3
+ size 93140
runs/Mar30_15-30-11_Braylon/events.out.tfevents.1711787753.Braylon.21497.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15ca0c62afb04058b08608cbc96562a5e226f7f6122a4dd3668d55f4f0f0cd9f
3
+ size 828
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.99,
3
+ "train_loss": 1918.4646684695513,
4
+ "train_runtime": 14252.7073,
5
+ "train_samples": 30567,
6
+ "train_samples_per_second": 1.403,
7
+ "train_steps_per_second": 0.088
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1972 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.9936,
5
+ "eval_steps": 100,
6
+ "global_step": 1248,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 4e-08,
14
+ "logits/chosen": 0.7221256494522095,
15
+ "logits/rejected": 0.8745549917221069,
16
+ "logps/chosen": -277.6833801269531,
17
+ "logps/rejected": -189.9869384765625,
18
+ "loss": 2500.0,
19
+ "rewards/accuracies": 0.0,
20
+ "rewards/chosen": 0.0,
21
+ "rewards/margins": 0.0,
22
+ "rewards/rejected": 0.0,
23
+ "step": 1
24
+ },
25
+ {
26
+ "epoch": 0.03,
27
+ "learning_rate": 4.0000000000000003e-07,
28
+ "logits/chosen": 0.6776503324508667,
29
+ "logits/rejected": 0.7872164845466614,
30
+ "logps/chosen": -273.4142761230469,
31
+ "logps/rejected": -216.78836059570312,
32
+ "loss": 2505.8785,
33
+ "rewards/accuracies": 0.3888888955116272,
34
+ "rewards/chosen": 0.00015669333515688777,
35
+ "rewards/margins": -0.0005680065951310098,
36
+ "rewards/rejected": 0.0007246998138725758,
37
+ "step": 10
38
+ },
39
+ {
40
+ "epoch": 0.06,
41
+ "learning_rate": 8.000000000000001e-07,
42
+ "logits/chosen": 0.6071363687515259,
43
+ "logits/rejected": 0.8961409330368042,
44
+ "logps/chosen": -252.6411590576172,
45
+ "logps/rejected": -188.6440887451172,
46
+ "loss": 2498.5875,
47
+ "rewards/accuracies": 0.48750001192092896,
48
+ "rewards/chosen": 0.0007678332040086389,
49
+ "rewards/margins": 0.00017280881002079695,
50
+ "rewards/rejected": 0.0005950243212282658,
51
+ "step": 20
52
+ },
53
+ {
54
+ "epoch": 0.1,
55
+ "learning_rate": 1.2000000000000002e-06,
56
+ "logits/chosen": 0.5971012711524963,
57
+ "logits/rejected": 0.7454315423965454,
58
+ "logps/chosen": -272.0901184082031,
59
+ "logps/rejected": -205.0573272705078,
60
+ "loss": 2504.0305,
61
+ "rewards/accuracies": 0.44999998807907104,
62
+ "rewards/chosen": -0.00030031849746592343,
63
+ "rewards/margins": -0.0003763613640330732,
64
+ "rewards/rejected": 7.604288111906499e-05,
65
+ "step": 30
66
+ },
67
+ {
68
+ "epoch": 0.13,
69
+ "learning_rate": 1.6000000000000001e-06,
70
+ "logits/chosen": 0.5639680027961731,
71
+ "logits/rejected": 0.8156954646110535,
72
+ "logps/chosen": -254.08987426757812,
73
+ "logps/rejected": -199.48912048339844,
74
+ "loss": 2499.0203,
75
+ "rewards/accuracies": 0.5062500238418579,
76
+ "rewards/chosen": -0.00016066078387666494,
77
+ "rewards/margins": 0.00013786301133222878,
78
+ "rewards/rejected": -0.00029852380976080894,
79
+ "step": 40
80
+ },
81
+ {
82
+ "epoch": 0.16,
83
+ "learning_rate": 2.0000000000000003e-06,
84
+ "logits/chosen": 0.6404772996902466,
85
+ "logits/rejected": 0.8446690440177917,
86
+ "logps/chosen": -264.58636474609375,
87
+ "logps/rejected": -211.51986694335938,
88
+ "loss": 2499.9363,
89
+ "rewards/accuracies": 0.46875,
90
+ "rewards/chosen": 0.0005515815573744476,
91
+ "rewards/margins": 3.0463817893178202e-05,
92
+ "rewards/rejected": 0.0005211178213357925,
93
+ "step": 50
94
+ },
95
+ {
96
+ "epoch": 0.19,
97
+ "learning_rate": 2.4000000000000003e-06,
98
+ "logits/chosen": 0.693733274936676,
99
+ "logits/rejected": 0.8246955871582031,
100
+ "logps/chosen": -266.50872802734375,
101
+ "logps/rejected": -209.1096649169922,
102
+ "loss": 2498.2361,
103
+ "rewards/accuracies": 0.4375,
104
+ "rewards/chosen": 0.0005181363085284829,
105
+ "rewards/margins": 0.00020206482440698892,
106
+ "rewards/rejected": 0.0003160713822580874,
107
+ "step": 60
108
+ },
109
+ {
110
+ "epoch": 0.22,
111
+ "learning_rate": 2.8000000000000003e-06,
112
+ "logits/chosen": 0.6724094152450562,
113
+ "logits/rejected": 0.8244352340698242,
114
+ "logps/chosen": -256.1933898925781,
115
+ "logps/rejected": -198.73048400878906,
116
+ "loss": 2492.3055,
117
+ "rewards/accuracies": 0.518750011920929,
118
+ "rewards/chosen": 0.00043690926395356655,
119
+ "rewards/margins": 0.0007959330687299371,
120
+ "rewards/rejected": -0.0003590236883610487,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.26,
125
+ "learning_rate": 3.2000000000000003e-06,
126
+ "logits/chosen": 0.6599764227867126,
127
+ "logits/rejected": 0.8170899152755737,
128
+ "logps/chosen": -273.2989807128906,
129
+ "logps/rejected": -216.25827026367188,
130
+ "loss": 2488.535,
131
+ "rewards/accuracies": 0.4937500059604645,
132
+ "rewards/chosen": 0.0022243678104132414,
133
+ "rewards/margins": 0.0011816672049462795,
134
+ "rewards/rejected": 0.0010427006054669619,
135
+ "step": 80
136
+ },
137
+ {
138
+ "epoch": 0.29,
139
+ "learning_rate": 3.6000000000000003e-06,
140
+ "logits/chosen": 0.6013220548629761,
141
+ "logits/rejected": 0.8273889422416687,
142
+ "logps/chosen": -268.2584533691406,
143
+ "logps/rejected": -213.9058380126953,
144
+ "loss": 2482.6732,
145
+ "rewards/accuracies": 0.574999988079071,
146
+ "rewards/chosen": 0.003236656542867422,
147
+ "rewards/margins": 0.0017823975067585707,
148
+ "rewards/rejected": 0.001454259268939495,
149
+ "step": 90
150
+ },
151
+ {
152
+ "epoch": 0.32,
153
+ "learning_rate": 4.000000000000001e-06,
154
+ "logits/chosen": 0.6121121644973755,
155
+ "logits/rejected": 0.8102381825447083,
156
+ "logps/chosen": -258.34576416015625,
157
+ "logps/rejected": -212.81478881835938,
158
+ "loss": 2477.3281,
159
+ "rewards/accuracies": 0.543749988079071,
160
+ "rewards/chosen": 0.0035489716101437807,
161
+ "rewards/margins": 0.0023237697314471006,
162
+ "rewards/rejected": 0.001225201180204749,
163
+ "step": 100
164
+ },
165
+ {
166
+ "epoch": 0.32,
167
+ "eval_logits/chosen": 0.8220780491828918,
168
+ "eval_logits/rejected": 0.8795672655105591,
169
+ "eval_logps/chosen": -256.7978210449219,
170
+ "eval_logps/rejected": -233.720703125,
171
+ "eval_loss": 2500.715576171875,
172
+ "eval_rewards/accuracies": 0.49300000071525574,
173
+ "eval_rewards/chosen": -0.0018364518182352185,
174
+ "eval_rewards/margins": -1.2206258361402433e-05,
175
+ "eval_rewards/rejected": -0.0018242454389110208,
176
+ "eval_runtime": 442.8545,
177
+ "eval_samples_per_second": 4.516,
178
+ "eval_steps_per_second": 1.129,
179
+ "step": 100
180
+ },
181
+ {
182
+ "epoch": 0.35,
183
+ "learning_rate": 4.4e-06,
184
+ "logits/chosen": 0.6344770193099976,
185
+ "logits/rejected": 0.8470233082771301,
186
+ "logps/chosen": -257.6260986328125,
187
+ "logps/rejected": -205.66622924804688,
188
+ "loss": 2466.1889,
189
+ "rewards/accuracies": 0.6000000238418579,
190
+ "rewards/chosen": 0.005115890875458717,
191
+ "rewards/margins": 0.003451352473348379,
192
+ "rewards/rejected": 0.0016645386349409819,
193
+ "step": 110
194
+ },
195
+ {
196
+ "epoch": 0.38,
197
+ "learning_rate": 4.800000000000001e-06,
198
+ "logits/chosen": 0.6225894689559937,
199
+ "logits/rejected": 0.7519047856330872,
200
+ "logps/chosen": -273.39776611328125,
201
+ "logps/rejected": -211.7797088623047,
202
+ "loss": 2446.4322,
203
+ "rewards/accuracies": 0.6312500238418579,
204
+ "rewards/chosen": 0.007990755140781403,
205
+ "rewards/margins": 0.005494705401360989,
206
+ "rewards/rejected": 0.0024960506707429886,
207
+ "step": 120
208
+ },
209
+ {
210
+ "epoch": 0.42,
211
+ "learning_rate": 4.999755441268144e-06,
212
+ "logits/chosen": 0.5374079942703247,
213
+ "logits/rejected": 0.8167151212692261,
214
+ "logps/chosen": -283.385498046875,
215
+ "logps/rejected": -228.09716796875,
216
+ "loss": 2434.1764,
217
+ "rewards/accuracies": 0.606249988079071,
218
+ "rewards/chosen": 0.010817321017384529,
219
+ "rewards/margins": 0.006822638213634491,
220
+ "rewards/rejected": 0.003994682338088751,
221
+ "step": 130
222
+ },
223
+ {
224
+ "epoch": 0.45,
225
+ "learning_rate": 4.997799258487003e-06,
226
+ "logits/chosen": 0.6246897578239441,
227
+ "logits/rejected": 0.8092406988143921,
228
+ "logps/chosen": -273.5729064941406,
229
+ "logps/rejected": -220.0849609375,
230
+ "loss": 2406.7561,
231
+ "rewards/accuracies": 0.606249988079071,
232
+ "rewards/chosen": 0.013732816092669964,
233
+ "rewards/margins": 0.009700324386358261,
234
+ "rewards/rejected": 0.004032492637634277,
235
+ "step": 140
236
+ },
237
+ {
238
+ "epoch": 0.48,
239
+ "learning_rate": 4.993888423734898e-06,
240
+ "logits/chosen": 0.6373761892318726,
241
+ "logits/rejected": 0.7924760580062866,
242
+ "logps/chosen": -259.81890869140625,
243
+ "logps/rejected": -198.0768585205078,
244
+ "loss": 2378.9869,
245
+ "rewards/accuracies": 0.668749988079071,
246
+ "rewards/chosen": 0.017864007502794266,
247
+ "rewards/margins": 0.01282494980841875,
248
+ "rewards/rejected": 0.00503905676305294,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.51,
253
+ "learning_rate": 4.988025997434253e-06,
254
+ "logits/chosen": 0.6211603283882141,
255
+ "logits/rejected": 0.7996471524238586,
256
+ "logps/chosen": -262.85723876953125,
257
+ "logps/rejected": -209.2670135498047,
258
+ "loss": 2347.3615,
259
+ "rewards/accuracies": 0.65625,
260
+ "rewards/chosen": 0.02218160405755043,
261
+ "rewards/margins": 0.016304031014442444,
262
+ "rewards/rejected": 0.005877572111785412,
263
+ "step": 160
264
+ },
265
+ {
266
+ "epoch": 0.54,
267
+ "learning_rate": 4.980216567224801e-06,
268
+ "logits/chosen": 0.5590766668319702,
269
+ "logits/rejected": 0.8358405232429504,
270
+ "logps/chosen": -268.93450927734375,
271
+ "logps/rejected": -217.07223510742188,
272
+ "loss": 2314.8727,
273
+ "rewards/accuracies": 0.668749988079071,
274
+ "rewards/chosen": 0.027453165501356125,
275
+ "rewards/margins": 0.02034853585064411,
276
+ "rewards/rejected": 0.0071046315133571625,
277
+ "step": 170
278
+ },
279
+ {
280
+ "epoch": 0.58,
281
+ "learning_rate": 4.970466244373527e-06,
282
+ "logits/chosen": 0.5857344269752502,
283
+ "logits/rejected": 0.8025603294372559,
284
+ "logps/chosen": -269.49420166015625,
285
+ "logps/rejected": -224.668701171875,
286
+ "loss": 2323.1209,
287
+ "rewards/accuracies": 0.675000011920929,
288
+ "rewards/chosen": 0.026119906455278397,
289
+ "rewards/margins": 0.019484227523207664,
290
+ "rewards/rejected": 0.006635676138103008,
291
+ "step": 180
292
+ },
293
+ {
294
+ "epoch": 0.61,
295
+ "learning_rate": 4.958782658992307e-06,
296
+ "logits/chosen": 0.6378930807113647,
297
+ "logits/rejected": 0.7625473737716675,
298
+ "logps/chosen": -245.82064819335938,
299
+ "logps/rejected": -212.6953582763672,
300
+ "loss": 2353.0727,
301
+ "rewards/accuracies": 0.59375,
302
+ "rewards/chosen": 0.02194877900183201,
303
+ "rewards/margins": 0.01612677052617073,
304
+ "rewards/rejected": 0.0058220066130161285,
305
+ "step": 190
306
+ },
307
+ {
308
+ "epoch": 0.64,
309
+ "learning_rate": 4.945174954066957e-06,
310
+ "logits/chosen": 0.5794962048530579,
311
+ "logits/rejected": 0.7216407656669617,
312
+ "logps/chosen": -262.74481201171875,
313
+ "logps/rejected": -205.56753540039062,
314
+ "loss": 2224.3488,
315
+ "rewards/accuracies": 0.6625000238418579,
316
+ "rewards/chosen": 0.040737103670835495,
317
+ "rewards/margins": 0.03179619461297989,
318
+ "rewards/rejected": 0.008940907195210457,
319
+ "step": 200
320
+ },
321
+ {
322
+ "epoch": 0.64,
323
+ "eval_logits/chosen": 0.7462335824966431,
324
+ "eval_logits/rejected": 0.8050766587257385,
325
+ "eval_logps/chosen": -258.5672912597656,
326
+ "eval_logps/rejected": -235.52040100097656,
327
+ "eval_loss": 2499.890380859375,
328
+ "eval_rewards/accuracies": 0.5015000104904175,
329
+ "eval_rewards/chosen": -0.019531190395355225,
330
+ "eval_rewards/margins": 0.0002903530257754028,
331
+ "eval_rewards/rejected": -0.019821541383862495,
332
+ "eval_runtime": 412.3797,
333
+ "eval_samples_per_second": 4.85,
334
+ "eval_steps_per_second": 1.212,
335
+ "step": 200
336
+ },
337
+ {
338
+ "epoch": 0.67,
339
+ "learning_rate": 4.929653778302397e-06,
340
+ "logits/chosen": 0.6170880198478699,
341
+ "logits/rejected": 0.7665790915489197,
342
+ "logps/chosen": -260.6783447265625,
343
+ "logps/rejected": -205.8716278076172,
344
+ "loss": 2165.3709,
345
+ "rewards/accuracies": 0.6812499761581421,
346
+ "rewards/chosen": 0.04661153629422188,
347
+ "rewards/margins": 0.039808522909879684,
348
+ "rewards/rejected": 0.006803011987358332,
349
+ "step": 210
350
+ },
351
+ {
352
+ "epoch": 0.7,
353
+ "learning_rate": 4.912231277789509e-06,
354
+ "logits/chosen": 0.5444914102554321,
355
+ "logits/rejected": 0.7560594081878662,
356
+ "logps/chosen": -277.1501770019531,
357
+ "logps/rejected": -227.8493194580078,
358
+ "loss": 2199.4646,
359
+ "rewards/accuracies": 0.606249988079071,
360
+ "rewards/chosen": 0.05624958872795105,
361
+ "rewards/margins": 0.037255048751831055,
362
+ "rewards/rejected": 0.018994538113474846,
363
+ "step": 220
364
+ },
365
+ {
366
+ "epoch": 0.74,
367
+ "learning_rate": 4.892921086500219e-06,
368
+ "logits/chosen": 0.5499123930931091,
369
+ "logits/rejected": 0.6719511151313782,
370
+ "logps/chosen": -243.00390625,
371
+ "logps/rejected": -205.0264434814453,
372
+ "loss": 2218.1998,
373
+ "rewards/accuracies": 0.6187499761581421,
374
+ "rewards/chosen": 0.04481152817606926,
375
+ "rewards/margins": 0.03403898701071739,
376
+ "rewards/rejected": 0.010772541165351868,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 0.77,
381
+ "learning_rate": 4.871738315618236e-06,
382
+ "logits/chosen": 0.5359088182449341,
383
+ "logits/rejected": 0.7013985514640808,
384
+ "logps/chosen": -263.36322021484375,
385
+ "logps/rejected": -217.5428924560547,
386
+ "loss": 2121.8713,
387
+ "rewards/accuracies": 0.612500011920929,
388
+ "rewards/chosen": 0.06463086605072021,
389
+ "rewards/margins": 0.04686326906085014,
390
+ "rewards/rejected": 0.017767589539289474,
391
+ "step": 240
392
+ },
393
+ {
394
+ "epoch": 0.8,
395
+ "learning_rate": 4.848699541713801e-06,
396
+ "logits/chosen": 0.5078493356704712,
397
+ "logits/rejected": 0.6534050703048706,
398
+ "logps/chosen": -255.99612426757812,
399
+ "logps/rejected": -209.0691680908203,
400
+ "loss": 2179.4311,
401
+ "rewards/accuracies": 0.6499999761581421,
402
+ "rewards/chosen": 0.05505412817001343,
403
+ "rewards/margins": 0.04023104906082153,
404
+ "rewards/rejected": 0.014823079109191895,
405
+ "step": 250
406
+ },
407
+ {
408
+ "epoch": 0.83,
409
+ "learning_rate": 4.823822793771696e-06,
410
+ "logits/chosen": 0.5251200795173645,
411
+ "logits/rejected": 0.6907744407653809,
412
+ "logps/chosen": -244.97616577148438,
413
+ "logps/rejected": -193.0167694091797,
414
+ "loss": 2039.2832,
415
+ "rewards/accuracies": 0.699999988079071,
416
+ "rewards/chosen": 0.06581679731607437,
417
+ "rewards/margins": 0.05999577045440674,
418
+ "rewards/rejected": 0.005821021273732185,
419
+ "step": 260
420
+ },
421
+ {
422
+ "epoch": 0.86,
423
+ "learning_rate": 4.797127539082669e-06,
424
+ "logits/chosen": 0.5102941393852234,
425
+ "logits/rejected": 0.6884879469871521,
426
+ "logps/chosen": -267.79193115234375,
427
+ "logps/rejected": -216.37808227539062,
428
+ "loss": 2084.2711,
429
+ "rewards/accuracies": 0.6312500238418579,
430
+ "rewards/chosen": 0.06373900920152664,
431
+ "rewards/margins": 0.05440465360879898,
432
+ "rewards/rejected": 0.009334356524050236,
433
+ "step": 270
434
+ },
435
+ {
436
+ "epoch": 0.9,
437
+ "learning_rate": 4.7686346680093135e-06,
438
+ "logits/chosen": 0.46306952834129333,
439
+ "logits/rejected": 0.6997040510177612,
440
+ "logps/chosen": -248.92501831054688,
441
+ "logps/rejected": -192.95266723632812,
442
+ "loss": 1956.5348,
443
+ "rewards/accuracies": 0.6812499761581421,
444
+ "rewards/chosen": 0.08462236821651459,
445
+ "rewards/margins": 0.07454703748226166,
446
+ "rewards/rejected": 0.01007532887160778,
447
+ "step": 280
448
+ },
449
+ {
450
+ "epoch": 0.93,
451
+ "learning_rate": 4.738366477638319e-06,
452
+ "logits/chosen": 0.49755996465682983,
453
+ "logits/rejected": 0.6823209524154663,
454
+ "logps/chosen": -252.31265258789062,
455
+ "logps/rejected": -203.94357299804688,
456
+ "loss": 2082.6836,
457
+ "rewards/accuracies": 0.6187499761581421,
458
+ "rewards/chosen": 0.060582734644412994,
459
+ "rewards/margins": 0.0560123547911644,
460
+ "rewards/rejected": 0.004570374730974436,
461
+ "step": 290
462
+ },
463
+ {
464
+ "epoch": 0.96,
465
+ "learning_rate": 4.7063466543318965e-06,
466
+ "logits/chosen": 0.40442362427711487,
467
+ "logits/rejected": 0.643377423286438,
468
+ "logps/chosen": -253.7049102783203,
469
+ "logps/rejected": -207.00146484375,
470
+ "loss": 1898.0719,
471
+ "rewards/accuracies": 0.6875,
472
+ "rewards/chosen": 0.09280969202518463,
473
+ "rewards/margins": 0.08169680833816528,
474
+ "rewards/rejected": 0.011112888343632221,
475
+ "step": 300
476
+ },
477
+ {
478
+ "epoch": 0.96,
479
+ "eval_logits/chosen": 0.6232779622077942,
480
+ "eval_logits/rejected": 0.6844168305397034,
481
+ "eval_logps/chosen": -262.24908447265625,
482
+ "eval_logps/rejected": -239.25296020507812,
483
+ "eval_loss": 2505.691162109375,
484
+ "eval_rewards/accuracies": 0.5139999985694885,
485
+ "eval_rewards/chosen": -0.05634931102395058,
486
+ "eval_rewards/margins": 0.0007975505432114005,
487
+ "eval_rewards/rejected": -0.057146865874528885,
488
+ "eval_runtime": 411.7351,
489
+ "eval_samples_per_second": 4.857,
490
+ "eval_steps_per_second": 1.214,
491
+ "step": 300
492
+ },
493
+ {
494
+ "epoch": 0.99,
495
+ "learning_rate": 4.672600255192022e-06,
496
+ "logits/chosen": 0.4478415846824646,
497
+ "logits/rejected": 0.6094005703926086,
498
+ "logps/chosen": -280.0593566894531,
499
+ "logps/rejected": -230.06820678710938,
500
+ "loss": 1902.6814,
501
+ "rewards/accuracies": 0.699999988079071,
502
+ "rewards/chosen": 0.08015590906143188,
503
+ "rewards/margins": 0.08636869490146637,
504
+ "rewards/rejected": -0.006212792359292507,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 1.02,
509
+ "learning_rate": 4.6371536884520115e-06,
510
+ "logits/chosen": 0.4466761648654938,
511
+ "logits/rejected": 0.5766214728355408,
512
+ "logps/chosen": -260.2146911621094,
513
+ "logps/rejected": -202.6782989501953,
514
+ "loss": 1965.6586,
515
+ "rewards/accuracies": 0.6625000238418579,
516
+ "rewards/chosen": 0.08317091315984726,
517
+ "rewards/margins": 0.07663293927907944,
518
+ "rewards/rejected": 0.006537970155477524,
519
+ "step": 320
520
+ },
521
+ {
522
+ "epoch": 1.06,
523
+ "learning_rate": 4.600034692810764e-06,
524
+ "logits/chosen": 0.4064536690711975,
525
+ "logits/rejected": 0.6527116894721985,
526
+ "logps/chosen": -285.71063232421875,
527
+ "logps/rejected": -232.9168243408203,
528
+ "loss": 1807.0604,
529
+ "rewards/accuracies": 0.675000011920929,
530
+ "rewards/chosen": 0.1063646450638771,
531
+ "rewards/margins": 0.10833654552698135,
532
+ "rewards/rejected": -0.001971897669136524,
533
+ "step": 330
534
+ },
535
+ {
536
+ "epoch": 1.09,
537
+ "learning_rate": 4.561272315725852e-06,
538
+ "logits/chosen": 0.39116889238357544,
539
+ "logits/rejected": 0.5679996609687805,
540
+ "logps/chosen": -269.14727783203125,
541
+ "logps/rejected": -226.08987426757812,
542
+ "loss": 1983.8736,
543
+ "rewards/accuracies": 0.675000011920929,
544
+ "rewards/chosen": 0.09996851533651352,
545
+ "rewards/margins": 0.08038529753684998,
546
+ "rewards/rejected": 0.019583214074373245,
547
+ "step": 340
548
+ },
549
+ {
550
+ "epoch": 1.12,
551
+ "learning_rate": 4.520896890682449e-06,
552
+ "logits/chosen": 0.3980609178543091,
553
+ "logits/rejected": 0.5835639834403992,
554
+ "logps/chosen": -261.222412109375,
555
+ "logps/rejected": -211.2742156982422,
556
+ "loss": 1903.2615,
557
+ "rewards/accuracies": 0.625,
558
+ "rewards/chosen": 0.11096767336130142,
559
+ "rewards/margins": 0.10369744151830673,
560
+ "rewards/rejected": 0.007270221598446369,
561
+ "step": 350
562
+ },
563
+ {
564
+ "epoch": 1.15,
565
+ "learning_rate": 4.478940013455864e-06,
566
+ "logits/chosen": 0.32668036222457886,
567
+ "logits/rejected": 0.642693042755127,
568
+ "logps/chosen": -248.02194213867188,
569
+ "logps/rejected": -195.903564453125,
570
+ "loss": 1819.0176,
571
+ "rewards/accuracies": 0.675000011920929,
572
+ "rewards/chosen": 0.11030755937099457,
573
+ "rewards/margins": 0.10598069429397583,
574
+ "rewards/rejected": 0.004326865542680025,
575
+ "step": 360
576
+ },
577
+ {
578
+ "epoch": 1.18,
579
+ "learning_rate": 4.435434517386281e-06,
580
+ "logits/chosen": 0.40880435705184937,
581
+ "logits/rejected": 0.6121729612350464,
582
+ "logps/chosen": -256.9469909667969,
583
+ "logps/rejected": -210.01284790039062,
584
+ "loss": 1875.7564,
585
+ "rewards/accuracies": 0.65625,
586
+ "rewards/chosen": 0.10248272120952606,
587
+ "rewards/margins": 0.09737477451562881,
588
+ "rewards/rejected": 0.005107959732413292,
589
+ "step": 370
590
+ },
591
+ {
592
+ "epoch": 1.22,
593
+ "learning_rate": 4.39041444768504e-06,
594
+ "logits/chosen": 0.35375842452049255,
595
+ "logits/rejected": 0.5443102121353149,
596
+ "logps/chosen": -271.12091064453125,
597
+ "logps/rejected": -223.96395874023438,
598
+ "loss": 1840.3432,
599
+ "rewards/accuracies": 0.6812499761581421,
600
+ "rewards/chosen": 0.10606244951486588,
601
+ "rewards/margins": 0.11150339990854263,
602
+ "rewards/rejected": -0.005440945271402597,
603
+ "step": 380
604
+ },
605
+ {
606
+ "epoch": 1.25,
607
+ "learning_rate": 4.343915034792569e-06,
608
+ "logits/chosen": 0.33456122875213623,
609
+ "logits/rejected": 0.6237267255783081,
610
+ "logps/chosen": -248.20458984375,
611
+ "logps/rejected": -200.05455017089844,
612
+ "loss": 1773.2205,
613
+ "rewards/accuracies": 0.6875,
614
+ "rewards/chosen": 0.11527778208255768,
615
+ "rewards/margins": 0.11671481281518936,
616
+ "rewards/rejected": -0.0014370165299624205,
617
+ "step": 390
618
+ },
619
+ {
620
+ "epoch": 1.28,
621
+ "learning_rate": 4.295972666808811e-06,
622
+ "logits/chosen": 0.3166826665401459,
623
+ "logits/rejected": 0.5174378156661987,
624
+ "logps/chosen": -285.5495910644531,
625
+ "logps/rejected": -236.46630859375,
626
+ "loss": 1879.8852,
627
+ "rewards/accuracies": 0.5874999761581421,
628
+ "rewards/chosen": 0.10617715120315552,
629
+ "rewards/margins": 0.1061965599656105,
630
+ "rewards/rejected": -1.941286063811276e-05,
631
+ "step": 400
632
+ },
633
+ {
634
+ "epoch": 1.28,
635
+ "eval_logits/chosen": 0.5215258598327637,
636
+ "eval_logits/rejected": 0.5839402079582214,
637
+ "eval_logps/chosen": -266.0533447265625,
638
+ "eval_logps/rejected": -243.10528564453125,
639
+ "eval_loss": 2516.08349609375,
640
+ "eval_rewards/accuracies": 0.5199999809265137,
641
+ "eval_rewards/chosen": -0.09439140558242798,
642
+ "eval_rewards/margins": 0.001278651412576437,
643
+ "eval_rewards/rejected": -0.09567005187273026,
644
+ "eval_runtime": 412.4154,
645
+ "eval_samples_per_second": 4.849,
646
+ "eval_steps_per_second": 1.212,
647
+ "step": 400
648
+ },
649
+ {
650
+ "epoch": 1.31,
651
+ "learning_rate": 4.246624861017732e-06,
652
+ "logits/chosen": 0.31735214591026306,
653
+ "logits/rejected": 0.45617708563804626,
654
+ "logps/chosen": -242.1129608154297,
655
+ "logps/rejected": -200.98440551757812,
656
+ "loss": 1814.2686,
657
+ "rewards/accuracies": 0.6812499761581421,
658
+ "rewards/chosen": 0.13390564918518066,
659
+ "rewards/margins": 0.11606212705373764,
660
+ "rewards/rejected": 0.01784351095557213,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 1.34,
665
+ "learning_rate": 4.195910234528186e-06,
666
+ "logits/chosen": 0.34283334016799927,
667
+ "logits/rejected": 0.532746434211731,
668
+ "logps/chosen": -250.3861541748047,
669
+ "logps/rejected": -209.1632843017578,
670
+ "loss": 1950.3166,
671
+ "rewards/accuracies": 0.550000011920929,
672
+ "rewards/chosen": 0.08673225343227386,
673
+ "rewards/margins": 0.09020966291427612,
674
+ "rewards/rejected": -0.003477415069937706,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 1.38,
679
+ "learning_rate": 4.143868474054098e-06,
680
+ "logits/chosen": 0.2972554564476013,
681
+ "logits/rejected": 0.4938802123069763,
682
+ "logps/chosen": -245.431640625,
683
+ "logps/rejected": -195.8451385498047,
684
+ "loss": 1783.8299,
685
+ "rewards/accuracies": 0.675000011920929,
686
+ "rewards/chosen": 0.1252884864807129,
687
+ "rewards/margins": 0.11878180503845215,
688
+ "rewards/rejected": 0.006506689824163914,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 1.41,
693
+ "learning_rate": 4.0905403048576545e-06,
694
+ "logits/chosen": 0.3040269911289215,
695
+ "logits/rejected": 0.5599544048309326,
696
+ "logps/chosen": -254.08749389648438,
697
+ "logps/rejected": -213.42941284179688,
698
+ "loss": 1878.9361,
699
+ "rewards/accuracies": 0.6499999761581421,
700
+ "rewards/chosen": 0.1011713370680809,
701
+ "rewards/margins": 0.10365436226129532,
702
+ "rewards/rejected": -0.002483018906787038,
703
+ "step": 440
704
+ },
705
+ {
706
+ "epoch": 1.44,
707
+ "learning_rate": 4.035967458879751e-06,
708
+ "logits/chosen": 0.29073578119277954,
709
+ "logits/rejected": 0.4535519480705261,
710
+ "logps/chosen": -226.4980926513672,
711
+ "logps/rejected": -191.19338989257812,
712
+ "loss": 1882.7074,
713
+ "rewards/accuracies": 0.706250011920929,
714
+ "rewards/chosen": 0.11003535985946655,
715
+ "rewards/margins": 0.10973072052001953,
716
+ "rewards/rejected": 0.000304625544231385,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 1.47,
721
+ "learning_rate": 3.980192642082682e-06,
722
+ "logits/chosen": 0.27826622128486633,
723
+ "logits/rejected": 0.4509051740169525,
724
+ "logps/chosen": -246.6437225341797,
725
+ "logps/rejected": -199.22531127929688,
726
+ "loss": 1963.9541,
727
+ "rewards/accuracies": 0.637499988079071,
728
+ "rewards/chosen": 0.10869649797677994,
729
+ "rewards/margins": 0.10766948759555817,
730
+ "rewards/rejected": 0.0010270171333104372,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 1.5,
735
+ "learning_rate": 3.923259501030604e-06,
736
+ "logits/chosen": 0.24556589126586914,
737
+ "logits/rejected": 0.4926172196865082,
738
+ "logps/chosen": -247.1698455810547,
739
+ "logps/rejected": -208.8766326904297,
740
+ "loss": 1971.4818,
741
+ "rewards/accuracies": 0.625,
742
+ "rewards/chosen": 0.0876002386212349,
743
+ "rewards/margins": 0.08931747823953629,
744
+ "rewards/rejected": -0.0017172384541481733,
745
+ "step": 470
746
+ },
747
+ {
748
+ "epoch": 1.54,
749
+ "learning_rate": 3.865212588733927e-06,
750
+ "logits/chosen": 0.3068960905075073,
751
+ "logits/rejected": 0.515642523765564,
752
+ "logps/chosen": -253.49880981445312,
753
+ "logps/rejected": -201.22650146484375,
754
+ "loss": 1692.4139,
755
+ "rewards/accuracies": 0.6812499761581421,
756
+ "rewards/chosen": 0.12761953473091125,
757
+ "rewards/margins": 0.15515872836112976,
758
+ "rewards/rejected": -0.027539223432540894,
759
+ "step": 480
760
+ },
761
+ {
762
+ "epoch": 1.57,
763
+ "learning_rate": 3.8060973297843773e-06,
764
+ "logits/chosen": 0.2675209045410156,
765
+ "logits/rejected": 0.4657977521419525,
766
+ "logps/chosen": -260.93353271484375,
767
+ "logps/rejected": -221.60214233398438,
768
+ "loss": 1822.4361,
769
+ "rewards/accuracies": 0.6875,
770
+ "rewards/chosen": 0.10896619409322739,
771
+ "rewards/margins": 0.10793409496545792,
772
+ "rewards/rejected": 0.0010320901637896895,
773
+ "step": 490
774
+ },
775
+ {
776
+ "epoch": 1.6,
777
+ "learning_rate": 3.7459599848079965e-06,
778
+ "logits/chosen": 0.32330405712127686,
779
+ "logits/rejected": 0.49660053849220276,
780
+ "logps/chosen": -270.75567626953125,
781
+ "logps/rejected": -223.2246856689453,
782
+ "loss": 1917.2811,
783
+ "rewards/accuracies": 0.606249988079071,
784
+ "rewards/chosen": 0.1301083266735077,
785
+ "rewards/margins": 0.10672901570796967,
786
+ "rewards/rejected": 0.023379310965538025,
787
+ "step": 500
788
+ },
789
+ {
790
+ "epoch": 1.6,
791
+ "eval_logits/chosen": 0.4610958397388458,
792
+ "eval_logits/rejected": 0.5243986248970032,
793
+ "eval_logps/chosen": -268.1746826171875,
794
+ "eval_logps/rejected": -245.2342987060547,
795
+ "eval_loss": 2527.199462890625,
796
+ "eval_rewards/accuracies": 0.5115000009536743,
797
+ "eval_rewards/chosen": -0.11560481786727905,
798
+ "eval_rewards/margins": 0.0013554621255025268,
799
+ "eval_rewards/rejected": -0.11696028709411621,
800
+ "eval_runtime": 411.7771,
801
+ "eval_samples_per_second": 4.857,
802
+ "eval_steps_per_second": 1.214,
803
+ "step": 500
804
+ },
805
+ {
806
+ "epoch": 1.63,
807
+ "learning_rate": 3.684847614263898e-06,
808
+ "logits/chosen": 0.2653389871120453,
809
+ "logits/rejected": 0.5690582990646362,
810
+ "logps/chosen": -266.63861083984375,
811
+ "logps/rejected": -214.3343963623047,
812
+ "loss": 1769.2539,
813
+ "rewards/accuracies": 0.65625,
814
+ "rewards/chosen": 0.1135379895567894,
815
+ "rewards/margins": 0.13335202634334564,
816
+ "rewards/rejected": -0.01981404609978199,
817
+ "step": 510
818
+ },
819
+ {
820
+ "epoch": 1.66,
821
+ "learning_rate": 3.622808041617133e-06,
822
+ "logits/chosen": 0.23678083717823029,
823
+ "logits/rejected": 0.3923517167568207,
824
+ "logps/chosen": -251.62295532226562,
825
+ "logps/rejected": -213.0308074951172,
826
+ "loss": 1775.6238,
827
+ "rewards/accuracies": 0.699999988079071,
828
+ "rewards/chosen": 0.12413735687732697,
829
+ "rewards/margins": 0.11967913061380386,
830
+ "rewards/rejected": 0.0044582299888134,
831
+ "step": 520
832
+ },
833
+ {
834
+ "epoch": 1.7,
835
+ "learning_rate": 3.559889815914441e-06,
836
+ "logits/chosen": 0.28451576828956604,
837
+ "logits/rejected": 0.5491675138473511,
838
+ "logps/chosen": -242.1103057861328,
839
+ "logps/rejected": -196.47967529296875,
840
+ "loss": 1708.515,
841
+ "rewards/accuracies": 0.699999988079071,
842
+ "rewards/chosen": 0.13650254905223846,
843
+ "rewards/margins": 0.1416713297367096,
844
+ "rewards/rejected": -0.005168789066374302,
845
+ "step": 530
846
+ },
847
+ {
848
+ "epoch": 1.73,
849
+ "learning_rate": 3.496142173792219e-06,
850
+ "logits/chosen": 0.23486502468585968,
851
+ "logits/rejected": 0.4870077073574066,
852
+ "logps/chosen": -244.1999969482422,
853
+ "logps/rejected": -209.3772430419922,
854
+ "loss": 1659.4633,
855
+ "rewards/accuracies": 0.737500011920929,
856
+ "rewards/chosen": 0.13094991445541382,
857
+ "rewards/margins": 0.14943310618400574,
858
+ "rewards/rejected": -0.018483208492398262,
859
+ "step": 540
860
+ },
861
+ {
862
+ "epoch": 1.76,
863
+ "learning_rate": 3.4316150009464023e-06,
864
+ "logits/chosen": 0.2991487979888916,
865
+ "logits/rejected": 0.5229448080062866,
866
+ "logps/chosen": -264.29534912109375,
867
+ "logps/rejected": -222.90609741210938,
868
+ "loss": 1999.6244,
869
+ "rewards/accuracies": 0.643750011920929,
870
+ "rewards/chosen": 0.1066334992647171,
871
+ "rewards/margins": 0.0911371111869812,
872
+ "rewards/rejected": 0.015496388077735901,
873
+ "step": 550
874
+ },
875
+ {
876
+ "epoch": 1.79,
877
+ "learning_rate": 3.366358793094433e-06,
878
+ "logits/chosen": 0.18731743097305298,
879
+ "logits/rejected": 0.40507373213768005,
880
+ "logps/chosen": -245.1287078857422,
881
+ "logps/rejected": -194.5263214111328,
882
+ "loss": 1803.6146,
883
+ "rewards/accuracies": 0.6937500238418579,
884
+ "rewards/chosen": 0.15052266418933868,
885
+ "rewards/margins": 0.13785871863365173,
886
+ "rewards/rejected": 0.012663939967751503,
887
+ "step": 560
888
+ },
889
+ {
890
+ "epoch": 1.82,
891
+ "learning_rate": 3.3004246164598535e-06,
892
+ "logits/chosen": 0.20718684792518616,
893
+ "logits/rejected": 0.46155864000320435,
894
+ "logps/chosen": -270.30792236328125,
895
+ "logps/rejected": -233.5759735107422,
896
+ "loss": 1710.7586,
897
+ "rewards/accuracies": 0.668749988079071,
898
+ "rewards/chosen": 0.14311200380325317,
899
+ "rewards/margins": 0.14155760407447815,
900
+ "rewards/rejected": 0.0015544015914201736,
901
+ "step": 570
902
+ },
903
+ {
904
+ "epoch": 1.86,
905
+ "learning_rate": 3.233864067810446e-06,
906
+ "logits/chosen": 0.3184027075767517,
907
+ "logits/rejected": 0.4623110890388489,
908
+ "logps/chosen": -252.27920532226562,
909
+ "logps/rejected": -208.2698974609375,
910
+ "loss": 1681.7773,
911
+ "rewards/accuracies": 0.699999988079071,
912
+ "rewards/chosen": 0.137790247797966,
913
+ "rewards/margins": 0.15522876381874084,
914
+ "rewards/rejected": -0.017438489943742752,
915
+ "step": 580
916
+ },
917
+ {
918
+ "epoch": 1.89,
919
+ "learning_rate": 3.1667292340812077e-06,
920
+ "logits/chosen": 0.1437700092792511,
921
+ "logits/rejected": 0.3409765958786011,
922
+ "logps/chosen": -247.97341918945312,
923
+ "logps/rejected": -218.58163452148438,
924
+ "loss": 1856.1729,
925
+ "rewards/accuracies": 0.668749988079071,
926
+ "rewards/chosen": 0.13051538169384003,
927
+ "rewards/margins": 0.12071319669485092,
928
+ "rewards/rejected": 0.009802192449569702,
929
+ "step": 590
930
+ },
931
+ {
932
+ "epoch": 1.92,
933
+ "learning_rate": 3.099072651613728e-06,
934
+ "logits/chosen": 0.24014055728912354,
935
+ "logits/rejected": 0.3376144766807556,
936
+ "logps/chosen": -236.4192352294922,
937
+ "logps/rejected": -202.04861450195312,
938
+ "loss": 1799.3824,
939
+ "rewards/accuracies": 0.65625,
940
+ "rewards/chosen": 0.11552337557077408,
941
+ "rewards/margins": 0.12494631111621857,
942
+ "rewards/rejected": -0.009422937408089638,
943
+ "step": 600
944
+ },
945
+ {
946
+ "epoch": 1.92,
947
+ "eval_logits/chosen": 0.40748730301856995,
948
+ "eval_logits/rejected": 0.4713863432407379,
949
+ "eval_logps/chosen": -270.2481994628906,
950
+ "eval_logps/rejected": -247.35043334960938,
951
+ "eval_loss": 2534.42919921875,
952
+ "eval_rewards/accuracies": 0.5210000276565552,
953
+ "eval_rewards/chosen": -0.13634006679058075,
954
+ "eval_rewards/margins": 0.001781497965566814,
955
+ "eval_rewards/rejected": -0.13812156021595,
956
+ "eval_runtime": 412.3385,
957
+ "eval_samples_per_second": 4.85,
958
+ "eval_steps_per_second": 1.213,
959
+ "step": 600
960
+ },
961
+ {
962
+ "epoch": 1.95,
963
+ "learning_rate": 3.0309472650438982e-06,
964
+ "logits/chosen": 0.2844320833683014,
965
+ "logits/rejected": 0.44563204050064087,
966
+ "logps/chosen": -256.5829772949219,
967
+ "logps/rejected": -221.15725708007812,
968
+ "loss": 1905.6564,
969
+ "rewards/accuracies": 0.668749988079071,
970
+ "rewards/chosen": 0.10678652673959732,
971
+ "rewards/margins": 0.10849614441394806,
972
+ "rewards/rejected": -0.001709613250568509,
973
+ "step": 610
974
+ },
975
+ {
976
+ "epoch": 1.98,
977
+ "learning_rate": 2.9624063858701006e-06,
978
+ "logits/chosen": 0.2729710638523102,
979
+ "logits/rejected": 0.4474371075630188,
980
+ "logps/chosen": -252.75082397460938,
981
+ "logps/rejected": -210.4240264892578,
982
+ "loss": 1794.8119,
983
+ "rewards/accuracies": 0.6812499761581421,
984
+ "rewards/chosen": 0.11081516742706299,
985
+ "rewards/margins": 0.1245477944612503,
986
+ "rewards/rejected": -0.013732627034187317,
987
+ "step": 620
988
+ },
989
+ {
990
+ "epoch": 2.02,
991
+ "learning_rate": 2.8935036507343185e-06,
992
+ "logits/chosen": 0.20590758323669434,
993
+ "logits/rejected": 0.4346593916416168,
994
+ "logps/chosen": -262.78900146484375,
995
+ "logps/rejected": -220.1951141357422,
996
+ "loss": 1844.0162,
997
+ "rewards/accuracies": 0.643750011920929,
998
+ "rewards/chosen": 0.11512579768896103,
999
+ "rewards/margins": 0.12753179669380188,
1000
+ "rewards/rejected": -0.012406004592776299,
1001
+ "step": 630
1002
+ },
1003
+ {
1004
+ "epoch": 2.05,
1005
+ "learning_rate": 2.8242929794487926e-06,
1006
+ "logits/chosen": 0.18648579716682434,
1007
+ "logits/rejected": 0.4544282555580139,
1008
+ "logps/chosen": -268.1901550292969,
1009
+ "logps/rejected": -227.70193481445312,
1010
+ "loss": 1771.6072,
1011
+ "rewards/accuracies": 0.6875,
1012
+ "rewards/chosen": 0.1294766366481781,
1013
+ "rewards/margins": 0.14083310961723328,
1014
+ "rewards/rejected": -0.011356466449797153,
1015
+ "step": 640
1016
+ },
1017
+ {
1018
+ "epoch": 2.08,
1019
+ "learning_rate": 2.7548285328010984e-06,
1020
+ "logits/chosen": 0.26485538482666016,
1021
+ "logits/rejected": 0.36643001437187195,
1022
+ "logps/chosen": -250.9102783203125,
1023
+ "logps/rejected": -210.93991088867188,
1024
+ "loss": 1854.1963,
1025
+ "rewards/accuracies": 0.675000011920929,
1026
+ "rewards/chosen": 0.12583324313163757,
1027
+ "rewards/margins": 0.11704309284687042,
1028
+ "rewards/rejected": 0.008790150284767151,
1029
+ "step": 650
1030
+ },
1031
+ {
1032
+ "epoch": 2.11,
1033
+ "learning_rate": 2.6851646701706306e-06,
1034
+ "logits/chosen": 0.1529732346534729,
1035
+ "logits/rejected": 0.33808109164237976,
1036
+ "logps/chosen": -280.79803466796875,
1037
+ "logps/rejected": -242.85238647460938,
1038
+ "loss": 1942.4141,
1039
+ "rewards/accuracies": 0.5874999761581421,
1040
+ "rewards/chosen": 0.11561117321252823,
1041
+ "rewards/margins": 0.10545969009399414,
1042
+ "rewards/rejected": 0.010151493363082409,
1043
+ "step": 660
1044
+ },
1045
+ {
1046
+ "epoch": 2.14,
1047
+ "learning_rate": 2.6153559069897007e-06,
1048
+ "logits/chosen": 0.24622070789337158,
1049
+ "logits/rejected": 0.3663380444049835,
1050
+ "logps/chosen": -240.98788452148438,
1051
+ "logps/rejected": -203.41390991210938,
1052
+ "loss": 1846.866,
1053
+ "rewards/accuracies": 0.6812499761581421,
1054
+ "rewards/chosen": 0.10691628605127335,
1055
+ "rewards/margins": 0.11943434178829193,
1056
+ "rewards/rejected": -0.012518051080405712,
1057
+ "step": 670
1058
+ },
1059
+ {
1060
+ "epoch": 2.18,
1061
+ "learning_rate": 2.5454568720824937e-06,
1062
+ "logits/chosen": 0.14400359988212585,
1063
+ "logits/rejected": 0.4130190908908844,
1064
+ "logps/chosen": -238.1641082763672,
1065
+ "logps/rejected": -194.319580078125,
1066
+ "loss": 1685.1135,
1067
+ "rewards/accuracies": 0.731249988079071,
1068
+ "rewards/chosen": 0.1472087949514389,
1069
+ "rewards/margins": 0.141302227973938,
1070
+ "rewards/rejected": 0.005906577687710524,
1071
+ "step": 680
1072
+ },
1073
+ {
1074
+ "epoch": 2.21,
1075
+ "learning_rate": 2.4755222649153014e-06,
1076
+ "logits/chosen": 0.19388817250728607,
1077
+ "logits/rejected": 0.38251611590385437,
1078
+ "logps/chosen": -235.1546173095703,
1079
+ "logps/rejected": -187.75552368164062,
1080
+ "loss": 1779.3379,
1081
+ "rewards/accuracies": 0.6875,
1082
+ "rewards/chosen": 0.1266268938779831,
1083
+ "rewards/margins": 0.13590212166309357,
1084
+ "rewards/rejected": -0.009275219403207302,
1085
+ "step": 690
1086
+ },
1087
+ {
1088
+ "epoch": 2.24,
1089
+ "learning_rate": 2.4056068127914803e-06,
1090
+ "logits/chosen": 0.21440652012825012,
1091
+ "logits/rejected": 0.46250271797180176,
1092
+ "logps/chosen": -270.1151123046875,
1093
+ "logps/rejected": -226.77603149414062,
1094
+ "loss": 1751.5762,
1095
+ "rewards/accuracies": 0.668749988079071,
1096
+ "rewards/chosen": 0.12278898805379868,
1097
+ "rewards/margins": 0.15427103638648987,
1098
+ "rewards/rejected": -0.03148204833269119,
1099
+ "step": 700
1100
+ },
1101
+ {
1102
+ "epoch": 2.24,
1103
+ "eval_logits/chosen": 0.39058271050453186,
1104
+ "eval_logits/rejected": 0.45453789830207825,
1105
+ "eval_logps/chosen": -271.09881591796875,
1106
+ "eval_logps/rejected": -248.2779541015625,
1107
+ "eval_loss": 2531.35498046875,
1108
+ "eval_rewards/accuracies": 0.5180000066757202,
1109
+ "eval_rewards/chosen": -0.14484626054763794,
1110
+ "eval_rewards/margins": 0.0025505598168820143,
1111
+ "eval_rewards/rejected": -0.14739681780338287,
1112
+ "eval_runtime": 411.7503,
1113
+ "eval_samples_per_second": 4.857,
1114
+ "eval_steps_per_second": 1.214,
1115
+ "step": 700
1116
+ },
1117
+ {
1118
+ "epoch": 2.27,
1119
+ "learning_rate": 2.3357652280246125e-06,
1120
+ "logits/chosen": 0.22305497527122498,
1121
+ "logits/rejected": 0.4246363043785095,
1122
+ "logps/chosen": -244.64987182617188,
1123
+ "logps/rejected": -204.69613647460938,
1124
+ "loss": 1997.1527,
1125
+ "rewards/accuracies": 0.637499988079071,
1126
+ "rewards/chosen": 0.10293789952993393,
1127
+ "rewards/margins": 0.09320969879627228,
1128
+ "rewards/rejected": 0.009728200733661652,
1129
+ "step": 710
1130
+ },
1131
+ {
1132
+ "epoch": 2.3,
1133
+ "learning_rate": 2.2660521651234036e-06,
1134
+ "logits/chosen": 0.25884026288986206,
1135
+ "logits/rejected": 0.44062429666519165,
1136
+ "logps/chosen": -264.1467590332031,
1137
+ "logps/rejected": -212.3171844482422,
1138
+ "loss": 1703.6529,
1139
+ "rewards/accuracies": 0.6875,
1140
+ "rewards/chosen": 0.14418098330497742,
1141
+ "rewards/margins": 0.1545061618089676,
1142
+ "rewards/rejected": -0.0103251738473773,
1143
+ "step": 720
1144
+ },
1145
+ {
1146
+ "epoch": 2.34,
1147
+ "learning_rate": 2.1965221780218173e-06,
1148
+ "logits/chosen": 0.16370469331741333,
1149
+ "logits/rejected": 0.34310778975486755,
1150
+ "logps/chosen": -260.98602294921875,
1151
+ "logps/rejected": -223.19790649414062,
1152
+ "loss": 1666.0893,
1153
+ "rewards/accuracies": 0.6937500238418579,
1154
+ "rewards/chosen": 0.14241552352905273,
1155
+ "rewards/margins": 0.14847032725811005,
1156
+ "rewards/rejected": -0.006054792553186417,
1157
+ "step": 730
1158
+ },
1159
+ {
1160
+ "epoch": 2.37,
1161
+ "learning_rate": 2.1272296773879107e-06,
1162
+ "logits/chosen": 0.14960137009620667,
1163
+ "logits/rejected": 0.40142565965652466,
1164
+ "logps/chosen": -237.9215087890625,
1165
+ "logps/rejected": -199.8916015625,
1166
+ "loss": 1738.3658,
1167
+ "rewards/accuracies": 0.706250011920929,
1168
+ "rewards/chosen": 0.13602004945278168,
1169
+ "rewards/margins": 0.14607930183410645,
1170
+ "rewards/rejected": -0.010059243068099022,
1171
+ "step": 740
1172
+ },
1173
+ {
1174
+ "epoch": 2.4,
1175
+ "learning_rate": 2.058228888044788e-06,
1176
+ "logits/chosen": 0.2741914689540863,
1177
+ "logits/rejected": 0.3872790038585663,
1178
+ "logps/chosen": -223.29269409179688,
1179
+ "logps/rejected": -191.80650329589844,
1180
+ "loss": 1910.0223,
1181
+ "rewards/accuracies": 0.606249988079071,
1182
+ "rewards/chosen": 0.10628987848758698,
1183
+ "rewards/margins": 0.11196577548980713,
1184
+ "rewards/rejected": -0.00567590119317174,
1185
+ "step": 750
1186
+ },
1187
+ {
1188
+ "epoch": 2.43,
1189
+ "learning_rate": 1.989573806536978e-06,
1190
+ "logits/chosen": 0.18802039325237274,
1191
+ "logits/rejected": 0.4464651048183441,
1192
+ "logps/chosen": -256.7940979003906,
1193
+ "logps/rejected": -208.06430053710938,
1194
+ "loss": 1710.393,
1195
+ "rewards/accuracies": 0.706250011920929,
1196
+ "rewards/chosen": 0.15197846293449402,
1197
+ "rewards/margins": 0.1547323763370514,
1198
+ "rewards/rejected": -0.002753905486315489,
1199
+ "step": 760
1200
+ },
1201
+ {
1202
+ "epoch": 2.46,
1203
+ "learning_rate": 1.921318158875459e-06,
1204
+ "logits/chosen": 0.2419842779636383,
1205
+ "logits/rejected": 0.4348203241825104,
1206
+ "logps/chosen": -249.1728973388672,
1207
+ "logps/rejected": -207.82229614257812,
1208
+ "loss": 1884.0729,
1209
+ "rewards/accuracies": 0.65625,
1210
+ "rewards/chosen": 0.12094120681285858,
1211
+ "rewards/margins": 0.12061796337366104,
1212
+ "rewards/rejected": 0.0003232499584555626,
1213
+ "step": 770
1214
+ },
1215
+ {
1216
+ "epoch": 2.5,
1217
+ "learning_rate": 1.8535153584943915e-06,
1218
+ "logits/chosen": 0.2110445499420166,
1219
+ "logits/rejected": 0.4180677533149719,
1220
+ "logps/chosen": -257.97967529296875,
1221
+ "logps/rejected": -223.7866668701172,
1222
+ "loss": 1903.2082,
1223
+ "rewards/accuracies": 0.6187499761581421,
1224
+ "rewards/chosen": 0.12379207462072372,
1225
+ "rewards/margins": 0.1165727972984314,
1226
+ "rewards/rejected": 0.007219274528324604,
1227
+ "step": 780
1228
+ },
1229
+ {
1230
+ "epoch": 2.53,
1231
+ "learning_rate": 1.7862184644524422e-06,
1232
+ "logits/chosen": 0.2049965113401413,
1233
+ "logits/rejected": 0.4525831639766693,
1234
+ "logps/chosen": -256.8290710449219,
1235
+ "logps/rejected": -224.7571563720703,
1236
+ "loss": 1728.5514,
1237
+ "rewards/accuracies": 0.706250011920929,
1238
+ "rewards/chosen": 0.14060300588607788,
1239
+ "rewards/margins": 0.1399126946926117,
1240
+ "rewards/rejected": 0.0006903231260366738,
1241
+ "step": 790
1242
+ },
1243
+ {
1244
+ "epoch": 2.56,
1245
+ "learning_rate": 1.7194801399114471e-06,
1246
+ "logits/chosen": 0.1295299082994461,
1247
+ "logits/rejected": 0.38342705368995667,
1248
+ "logps/chosen": -265.95697021484375,
1249
+ "logps/rejected": -224.35922241210938,
1250
+ "loss": 1711.1711,
1251
+ "rewards/accuracies": 0.6812499761581421,
1252
+ "rewards/chosen": 0.1655883491039276,
1253
+ "rewards/margins": 0.14990000426769257,
1254
+ "rewards/rejected": 0.015688356012105942,
1255
+ "step": 800
1256
+ },
1257
+ {
1258
+ "epoch": 2.56,
1259
+ "eval_logits/chosen": 0.37592813372612,
1260
+ "eval_logits/rejected": 0.44024261832237244,
1261
+ "eval_logps/chosen": -271.4833984375,
1262
+ "eval_logps/rejected": -248.64402770996094,
1263
+ "eval_loss": 2536.2451171875,
1264
+ "eval_rewards/accuracies": 0.5145000219345093,
1265
+ "eval_rewards/chosen": -0.14869214594364166,
1266
+ "eval_rewards/margins": 0.0023655793629586697,
1267
+ "eval_rewards/rejected": -0.15105770528316498,
1268
+ "eval_runtime": 412.1594,
1269
+ "eval_samples_per_second": 4.852,
1270
+ "eval_steps_per_second": 1.213,
1271
+ "step": 800
1272
+ },
1273
+ {
1274
+ "epoch": 2.59,
1275
+ "learning_rate": 1.6533526109248632e-06,
1276
+ "logits/chosen": 0.22814953327178955,
1277
+ "logits/rejected": 0.44856709241867065,
1278
+ "logps/chosen": -264.6068115234375,
1279
+ "logps/rejected": -218.6339874267578,
1280
+ "loss": 1694.8418,
1281
+ "rewards/accuracies": 0.706250011920929,
1282
+ "rewards/chosen": 0.1343715339899063,
1283
+ "rewards/margins": 0.15012042224407196,
1284
+ "rewards/rejected": -0.015748897567391396,
1285
+ "step": 810
1286
+ },
1287
+ {
1288
+ "epoch": 2.62,
1289
+ "learning_rate": 1.5878876255682951e-06,
1290
+ "logits/chosen": 0.25933316349983215,
1291
+ "logits/rejected": 0.428168922662735,
1292
+ "logps/chosen": -271.8929748535156,
1293
+ "logps/rejected": -222.19677734375,
1294
+ "loss": 1799.3105,
1295
+ "rewards/accuracies": 0.675000011920929,
1296
+ "rewards/chosen": 0.13050588965415955,
1297
+ "rewards/margins": 0.12607057392597198,
1298
+ "rewards/rejected": 0.00443531759083271,
1299
+ "step": 820
1300
+ },
1301
+ {
1302
+ "epoch": 2.66,
1303
+ "learning_rate": 1.5231364134440485e-06,
1304
+ "logits/chosen": 0.16305533051490784,
1305
+ "logits/rejected": 0.37009209394454956,
1306
+ "logps/chosen": -239.7517547607422,
1307
+ "logps/rejected": -201.59359741210938,
1308
+ "loss": 1877.6279,
1309
+ "rewards/accuracies": 0.6625000238418579,
1310
+ "rewards/chosen": 0.13841071724891663,
1311
+ "rewards/margins": 0.12821929156780243,
1312
+ "rewards/rejected": 0.0101914182305336,
1313
+ "step": 830
1314
+ },
1315
+ {
1316
+ "epoch": 2.69,
1317
+ "learning_rate": 1.4591496455914292e-06,
1318
+ "logits/chosen": 0.23846232891082764,
1319
+ "logits/rejected": 0.44133177399635315,
1320
+ "logps/chosen": -269.0438232421875,
1321
+ "logps/rejected": -235.73202514648438,
1322
+ "loss": 1826.1201,
1323
+ "rewards/accuracies": 0.6499999761581421,
1324
+ "rewards/chosen": 0.12339715659618378,
1325
+ "rewards/margins": 0.12225867807865143,
1326
+ "rewards/rejected": 0.0011384893441572785,
1327
+ "step": 840
1328
+ },
1329
+ {
1330
+ "epoch": 2.72,
1331
+ "learning_rate": 1.395977394834132e-06,
1332
+ "logits/chosen": 0.2057628184556961,
1333
+ "logits/rejected": 0.457078218460083,
1334
+ "logps/chosen": -222.45474243164062,
1335
+ "logps/rejected": -178.09011840820312,
1336
+ "loss": 1674.3545,
1337
+ "rewards/accuracies": 0.7437499761581421,
1338
+ "rewards/chosen": 0.14612336456775665,
1339
+ "rewards/margins": 0.1515127718448639,
1340
+ "rewards/rejected": -0.005389401223510504,
1341
+ "step": 850
1342
+ },
1343
+ {
1344
+ "epoch": 2.75,
1345
+ "learning_rate": 1.3336690965957733e-06,
1346
+ "logits/chosen": 0.23303177952766418,
1347
+ "logits/rejected": 0.38948389887809753,
1348
+ "logps/chosen": -276.028564453125,
1349
+ "logps/rejected": -242.33120727539062,
1350
+ "loss": 1734.6137,
1351
+ "rewards/accuracies": 0.675000011920929,
1352
+ "rewards/chosen": 0.13224345445632935,
1353
+ "rewards/margins": 0.1367109715938568,
1354
+ "rewards/rejected": -0.004467502236366272,
1355
+ "step": 860
1356
+ },
1357
+ {
1358
+ "epoch": 2.78,
1359
+ "learning_rate": 1.2722735102142192e-06,
1360
+ "logits/chosen": 0.18385744094848633,
1361
+ "logits/rejected": 0.483973890542984,
1362
+ "logps/chosen": -234.6072540283203,
1363
+ "logps/rejected": -188.79147338867188,
1364
+ "loss": 1524.1409,
1365
+ "rewards/accuracies": 0.768750011920929,
1366
+ "rewards/chosen": 0.1483391523361206,
1367
+ "rewards/margins": 0.16734978556632996,
1368
+ "rewards/rejected": -0.019010629504919052,
1369
+ "step": 870
1370
+ },
1371
+ {
1372
+ "epoch": 2.82,
1373
+ "learning_rate": 1.2118386807849733e-06,
1374
+ "logits/chosen": 0.23927922546863556,
1375
+ "logits/rejected": 0.39777225255966187,
1376
+ "logps/chosen": -247.7241668701172,
1377
+ "logps/rejected": -209.2562255859375,
1378
+ "loss": 1804.3422,
1379
+ "rewards/accuracies": 0.6187499761581421,
1380
+ "rewards/chosen": 0.11453523486852646,
1381
+ "rewards/margins": 0.13239163160324097,
1382
+ "rewards/rejected": -0.0178564190864563,
1383
+ "step": 880
1384
+ },
1385
+ {
1386
+ "epoch": 2.85,
1387
+ "learning_rate": 1.1524119015635116e-06,
1388
+ "logits/chosen": 0.14219129085540771,
1389
+ "logits/rejected": 0.3456658124923706,
1390
+ "logps/chosen": -244.8931121826172,
1391
+ "logps/rejected": -194.36337280273438,
1392
+ "loss": 1655.0236,
1393
+ "rewards/accuracies": 0.768750011920929,
1394
+ "rewards/chosen": 0.17296788096427917,
1395
+ "rewards/margins": 0.17738190293312073,
1396
+ "rewards/rejected": -0.004414013121277094,
1397
+ "step": 890
1398
+ },
1399
+ {
1400
+ "epoch": 2.88,
1401
+ "learning_rate": 1.0940396769559584e-06,
1402
+ "logits/chosen": 0.23592603206634521,
1403
+ "logits/rejected": 0.3958457410335541,
1404
+ "logps/chosen": -242.33761596679688,
1405
+ "logps/rejected": -191.34255981445312,
1406
+ "loss": 1894.4447,
1407
+ "rewards/accuracies": 0.625,
1408
+ "rewards/chosen": 0.11816737800836563,
1409
+ "rewards/margins": 0.1154218316078186,
1410
+ "rewards/rejected": 0.002745547564700246,
1411
+ "step": 900
1412
+ },
1413
+ {
1414
+ "epoch": 2.88,
1415
+ "eval_logits/chosen": 0.3617529273033142,
1416
+ "eval_logits/rejected": 0.42615193128585815,
1417
+ "eval_logps/chosen": -272.0999755859375,
1418
+ "eval_logps/rejected": -249.2417449951172,
1419
+ "eval_loss": 2542.6298828125,
1420
+ "eval_rewards/accuracies": 0.5235000252723694,
1421
+ "eval_rewards/chosen": -0.15485772490501404,
1422
+ "eval_rewards/margins": 0.002177263842895627,
1423
+ "eval_rewards/rejected": -0.1570349782705307,
1424
+ "eval_runtime": 411.8184,
1425
+ "eval_samples_per_second": 4.857,
1426
+ "eval_steps_per_second": 1.214,
1427
+ "step": 900
1428
+ },
1429
+ {
1430
+ "epoch": 2.91,
1431
+ "learning_rate": 1.036767686127079e-06,
1432
+ "logits/chosen": 0.26092058420181274,
1433
+ "logits/rejected": 0.4202706217765808,
1434
+ "logps/chosen": -271.7019958496094,
1435
+ "logps/rejected": -229.44772338867188,
1436
+ "loss": 1612.1519,
1437
+ "rewards/accuracies": 0.7124999761581421,
1438
+ "rewards/chosen": 0.1469217836856842,
1439
+ "rewards/margins": 0.1562378704547882,
1440
+ "rewards/rejected": -0.009316088631749153,
1441
+ "step": 910
1442
+ },
1443
+ {
1444
+ "epoch": 2.94,
1445
+ "learning_rate": 9.806407472540644e-07,
1446
+ "logits/chosen": 0.16799505054950714,
1447
+ "logits/rejected": 0.41259631514549255,
1448
+ "logps/chosen": -245.1134033203125,
1449
+ "logps/rejected": -193.49716186523438,
1450
+ "loss": 1752.974,
1451
+ "rewards/accuracies": 0.675000011920929,
1452
+ "rewards/chosen": 0.1420082449913025,
1453
+ "rewards/margins": 0.16994443535804749,
1454
+ "rewards/rejected": -0.027936194092035294,
1455
+ "step": 920
1456
+ },
1457
+ {
1458
+ "epoch": 2.98,
1459
+ "learning_rate": 9.257027824540823e-07,
1460
+ "logits/chosen": 0.15648850798606873,
1461
+ "logits/rejected": 0.3334013819694519,
1462
+ "logps/chosen": -237.5020294189453,
1463
+ "logps/rejected": -190.00733947753906,
1464
+ "loss": 1678.6684,
1465
+ "rewards/accuracies": 0.75,
1466
+ "rewards/chosen": 0.1168346032500267,
1467
+ "rewards/margins": 0.14195187389850616,
1468
+ "rewards/rejected": -0.02511727437376976,
1469
+ "step": 930
1470
+ },
1471
+ {
1472
+ "epoch": 3.01,
1473
+ "learning_rate": 8.719967834130385e-07,
1474
+ "logits/chosen": 0.17559798061847687,
1475
+ "logits/rejected": 0.38583293557167053,
1476
+ "logps/chosen": -226.3419647216797,
1477
+ "logps/rejected": -192.8260498046875,
1478
+ "loss": 1782.9469,
1479
+ "rewards/accuracies": 0.675000011920929,
1480
+ "rewards/chosen": 0.13087774813175201,
1481
+ "rewards/margins": 0.14290814101696014,
1482
+ "rewards/rejected": -0.012030376121401787,
1483
+ "step": 940
1484
+ },
1485
+ {
1486
+ "epoch": 3.04,
1487
+ "learning_rate": 8.195647777424479e-07,
1488
+ "logits/chosen": 0.19569836556911469,
1489
+ "logits/rejected": 0.420011043548584,
1490
+ "logps/chosen": -243.2490692138672,
1491
+ "logps/rejected": -200.2803497314453,
1492
+ "loss": 1728.6527,
1493
+ "rewards/accuracies": 0.6812499761581421,
1494
+ "rewards/chosen": 0.1271076500415802,
1495
+ "rewards/margins": 0.14936277270317078,
1496
+ "rewards/rejected": -0.02225511521100998,
1497
+ "step": 950
1498
+ },
1499
+ {
1500
+ "epoch": 3.07,
1501
+ "learning_rate": 7.684477960907422e-07,
1502
+ "logits/chosen": 0.2223655879497528,
1503
+ "logits/rejected": 0.36084768176078796,
1504
+ "logps/chosen": -259.83001708984375,
1505
+ "logps/rejected": -217.5656280517578,
1506
+ "loss": 1787.5324,
1507
+ "rewards/accuracies": 0.699999988079071,
1508
+ "rewards/chosen": 0.13919194042682648,
1509
+ "rewards/margins": 0.12921075522899628,
1510
+ "rewards/rejected": 0.009981190785765648,
1511
+ "step": 960
1512
+ },
1513
+ {
1514
+ "epoch": 3.1,
1515
+ "learning_rate": 7.186858400347455e-07,
1516
+ "logits/chosen": 0.15693505108356476,
1517
+ "logits/rejected": 0.35678160190582275,
1518
+ "logps/chosen": -266.93768310546875,
1519
+ "logps/rejected": -233.291259765625,
1520
+ "loss": 1748.1148,
1521
+ "rewards/accuracies": 0.6812499761581421,
1522
+ "rewards/chosen": 0.12252594530582428,
1523
+ "rewards/margins": 0.13253001868724823,
1524
+ "rewards/rejected": -0.010004105977714062,
1525
+ "step": 970
1526
+ },
1527
+ {
1528
+ "epoch": 3.14,
1529
+ "learning_rate": 6.703178507764618e-07,
1530
+ "logits/chosen": 0.21335263550281525,
1531
+ "logits/rejected": 0.3721942901611328,
1532
+ "logps/chosen": -250.0565948486328,
1533
+ "logps/rejected": -213.24624633789062,
1534
+ "loss": 1792.84,
1535
+ "rewards/accuracies": 0.6937500238418579,
1536
+ "rewards/chosen": 0.1454544961452484,
1537
+ "rewards/margins": 0.12651817500591278,
1538
+ "rewards/rejected": 0.018936317414045334,
1539
+ "step": 980
1540
+ },
1541
+ {
1542
+ "epoch": 3.17,
1543
+ "learning_rate": 6.233816786696414e-07,
1544
+ "logits/chosen": 0.17327558994293213,
1545
+ "logits/rejected": 0.41259893774986267,
1546
+ "logps/chosen": -242.4098358154297,
1547
+ "logps/rejected": -191.53460693359375,
1548
+ "loss": 1636.6745,
1549
+ "rewards/accuracies": 0.6875,
1550
+ "rewards/chosen": 0.16031518578529358,
1551
+ "rewards/margins": 0.1668807417154312,
1552
+ "rewards/rejected": -0.006565576884895563,
1553
+ "step": 990
1554
+ },
1555
+ {
1556
+ "epoch": 3.2,
1557
+ "learning_rate": 5.77914053600005e-07,
1558
+ "logits/chosen": 0.15771836042404175,
1559
+ "logits/rejected": 0.384365975856781,
1560
+ "logps/chosen": -247.66213989257812,
1561
+ "logps/rejected": -207.638427734375,
1562
+ "loss": 1798.5389,
1563
+ "rewards/accuracies": 0.675000011920929,
1564
+ "rewards/chosen": 0.13780926167964935,
1565
+ "rewards/margins": 0.1279788464307785,
1566
+ "rewards/rejected": 0.00983042549341917,
1567
+ "step": 1000
1568
+ },
1569
+ {
1570
+ "epoch": 3.2,
1571
+ "eval_logits/chosen": 0.3559388518333435,
1572
+ "eval_logits/rejected": 0.42022594809532166,
1573
+ "eval_logps/chosen": -272.4200439453125,
1574
+ "eval_logps/rejected": -249.57797241210938,
1575
+ "eval_loss": 2542.728759765625,
1576
+ "eval_rewards/accuracies": 0.5205000042915344,
1577
+ "eval_rewards/chosen": -0.15805859863758087,
1578
+ "eval_rewards/margins": 0.0023383116349577904,
1579
+ "eval_rewards/rejected": -0.16039690375328064,
1580
+ "eval_runtime": 411.7713,
1581
+ "eval_samples_per_second": 4.857,
1582
+ "eval_steps_per_second": 1.214,
1583
+ "step": 1000
1584
+ },
1585
+ {
1586
+ "epoch": 3.23,
1587
+ "learning_rate": 5.339505562422851e-07,
1588
+ "logits/chosen": 0.18551324307918549,
1589
+ "logits/rejected": 0.3874700367450714,
1590
+ "logps/chosen": -258.8070983886719,
1591
+ "logps/rejected": -208.31655883789062,
1592
+ "loss": 1684.7502,
1593
+ "rewards/accuracies": 0.699999988079071,
1594
+ "rewards/chosen": 0.13547667860984802,
1595
+ "rewards/margins": 0.14734682440757751,
1596
+ "rewards/rejected": -0.011870155110955238,
1597
+ "step": 1010
1598
+ },
1599
+ {
1600
+ "epoch": 3.26,
1601
+ "learning_rate": 4.915255902165734e-07,
1602
+ "logits/chosen": 0.16962969303131104,
1603
+ "logits/rejected": 0.3440442383289337,
1604
+ "logps/chosen": -257.40386962890625,
1605
+ "logps/rejected": -213.15682983398438,
1606
+ "loss": 1727.4756,
1607
+ "rewards/accuracies": 0.675000011920929,
1608
+ "rewards/chosen": 0.1502576470375061,
1609
+ "rewards/margins": 0.15001319348812103,
1610
+ "rewards/rejected": 0.000244450056925416,
1611
+ "step": 1020
1612
+ },
1613
+ {
1614
+ "epoch": 3.3,
1615
+ "learning_rate": 4.506723551657879e-07,
1616
+ "logits/chosen": 0.08985213190317154,
1617
+ "logits/rejected": 0.3018781542778015,
1618
+ "logps/chosen": -238.0452117919922,
1619
+ "logps/rejected": -200.73129272460938,
1620
+ "loss": 1795.9609,
1621
+ "rewards/accuracies": 0.6875,
1622
+ "rewards/chosen": 0.1125258356332779,
1623
+ "rewards/margins": 0.11527595669031143,
1624
+ "rewards/rejected": -0.0027501187287271023,
1625
+ "step": 1030
1626
+ },
1627
+ {
1628
+ "epoch": 3.33,
1629
+ "learning_rate": 4.11422820775299e-07,
1630
+ "logits/chosen": 0.18122617900371552,
1631
+ "logits/rejected": 0.3776589035987854,
1632
+ "logps/chosen": -273.4202575683594,
1633
+ "logps/rejected": -214.76803588867188,
1634
+ "loss": 1575.2917,
1635
+ "rewards/accuracies": 0.675000011920929,
1636
+ "rewards/chosen": 0.14816126227378845,
1637
+ "rewards/margins": 0.1801377683877945,
1638
+ "rewards/rejected": -0.03197649493813515,
1639
+ "step": 1040
1640
+ },
1641
+ {
1642
+ "epoch": 3.36,
1643
+ "learning_rate": 3.7380770175506397e-07,
1644
+ "logits/chosen": 0.18917641043663025,
1645
+ "logits/rejected": 0.3701549172401428,
1646
+ "logps/chosen": -236.4595184326172,
1647
+ "logps/rejected": -196.47378540039062,
1648
+ "loss": 1757.2316,
1649
+ "rewards/accuracies": 0.7250000238418579,
1650
+ "rewards/chosen": 0.13911323249340057,
1651
+ "rewards/margins": 0.14288835227489471,
1652
+ "rewards/rejected": -0.0037751286290585995,
1653
+ "step": 1050
1654
+ },
1655
+ {
1656
+ "epoch": 3.39,
1657
+ "learning_rate": 3.3785643380384063e-07,
1658
+ "logits/chosen": 0.22518393397331238,
1659
+ "logits/rejected": 0.4300554394721985,
1660
+ "logps/chosen": -264.11102294921875,
1661
+ "logps/rejected": -221.0089569091797,
1662
+ "loss": 1780.259,
1663
+ "rewards/accuracies": 0.675000011920929,
1664
+ "rewards/chosen": 0.12094493955373764,
1665
+ "rewards/margins": 0.13927613198757172,
1666
+ "rewards/rejected": -0.018331199884414673,
1667
+ "step": 1060
1668
+ },
1669
+ {
1670
+ "epoch": 3.42,
1671
+ "learning_rate": 3.0359715057429186e-07,
1672
+ "logits/chosen": 0.15539857745170593,
1673
+ "logits/rejected": 0.330243319272995,
1674
+ "logps/chosen": -249.2795867919922,
1675
+ "logps/rejected": -213.9594268798828,
1676
+ "loss": 1857.3367,
1677
+ "rewards/accuracies": 0.668749988079071,
1678
+ "rewards/chosen": 0.11874745041131973,
1679
+ "rewards/margins": 0.1254003345966339,
1680
+ "rewards/rejected": -0.0066528706811368465,
1681
+ "step": 1070
1682
+ },
1683
+ {
1684
+ "epoch": 3.46,
1685
+ "learning_rate": 2.710566616570048e-07,
1686
+ "logits/chosen": 0.22669236361980438,
1687
+ "logits/rejected": 0.3851965069770813,
1688
+ "logps/chosen": -243.06094360351562,
1689
+ "logps/rejected": -207.26522827148438,
1690
+ "loss": 1802.6373,
1691
+ "rewards/accuracies": 0.675000011920929,
1692
+ "rewards/chosen": 0.11891194432973862,
1693
+ "rewards/margins": 0.1296943724155426,
1694
+ "rewards/rejected": -0.010782415047287941,
1695
+ "step": 1080
1696
+ },
1697
+ {
1698
+ "epoch": 3.49,
1699
+ "learning_rate": 2.40260431600654e-07,
1700
+ "logits/chosen": 0.11191525310277939,
1701
+ "logits/rejected": 0.3566216826438904,
1702
+ "logps/chosen": -258.7408142089844,
1703
+ "logps/rejected": -233.2893524169922,
1704
+ "loss": 1846.4627,
1705
+ "rewards/accuracies": 0.6499999761581421,
1706
+ "rewards/chosen": 0.13531699776649475,
1707
+ "rewards/margins": 0.1306256651878357,
1708
+ "rewards/rejected": 0.004691324662417173,
1709
+ "step": 1090
1710
+ },
1711
+ {
1712
+ "epoch": 3.52,
1713
+ "learning_rate": 2.1123255998472952e-07,
1714
+ "logits/chosen": 0.13580968976020813,
1715
+ "logits/rejected": 0.3724610209465027,
1716
+ "logps/chosen": -238.85391235351562,
1717
+ "logps/rejected": -202.35397338867188,
1718
+ "loss": 1834.9711,
1719
+ "rewards/accuracies": 0.65625,
1720
+ "rewards/chosen": 0.1293712556362152,
1721
+ "rewards/margins": 0.12213204801082611,
1722
+ "rewards/rejected": 0.007239216007292271,
1723
+ "step": 1100
1724
+ },
1725
+ {
1726
+ "epoch": 3.52,
1727
+ "eval_logits/chosen": 0.3531816899776459,
1728
+ "eval_logits/rejected": 0.417733371257782,
1729
+ "eval_logps/chosen": -272.4703369140625,
1730
+ "eval_logps/rejected": -249.63449096679688,
1731
+ "eval_loss": 2542.2373046875,
1732
+ "eval_rewards/accuracies": 0.5205000042915344,
1733
+ "eval_rewards/chosen": -0.15856170654296875,
1734
+ "eval_rewards/margins": 0.0024005102459341288,
1735
+ "eval_rewards/rejected": -0.16096222400665283,
1736
+ "eval_runtime": 411.8208,
1737
+ "eval_samples_per_second": 4.856,
1738
+ "eval_steps_per_second": 1.214,
1739
+ "step": 1100
1740
+ },
1741
+ {
1742
+ "epoch": 3.55,
1743
+ "learning_rate": 1.8399576256041525e-07,
1744
+ "logits/chosen": 0.27907487750053406,
1745
+ "logits/rejected": 0.39952999353408813,
1746
+ "logps/chosen": -293.59210205078125,
1747
+ "logps/rejected": -251.298583984375,
1748
+ "loss": 1979.5523,
1749
+ "rewards/accuracies": 0.5874999761581421,
1750
+ "rewards/chosen": 0.12343426048755646,
1751
+ "rewards/margins": 0.1068718284368515,
1752
+ "rewards/rejected": 0.016562417149543762,
1753
+ "step": 1110
1754
+ },
1755
+ {
1756
+ "epoch": 3.58,
1757
+ "learning_rate": 1.58571353474391e-07,
1758
+ "logits/chosen": 0.22743897140026093,
1759
+ "logits/rejected": 0.40824824571609497,
1760
+ "logps/chosen": -248.27365112304688,
1761
+ "logps/rejected": -217.0345916748047,
1762
+ "loss": 1971.9447,
1763
+ "rewards/accuracies": 0.6000000238418579,
1764
+ "rewards/chosen": 0.10497580468654633,
1765
+ "rewards/margins": 0.10696852207183838,
1766
+ "rewards/rejected": -0.0019927166868001223,
1767
+ "step": 1120
1768
+ },
1769
+ {
1770
+ "epoch": 3.62,
1771
+ "learning_rate": 1.3497922858944857e-07,
1772
+ "logits/chosen": 0.21788544952869415,
1773
+ "logits/rejected": 0.3408287763595581,
1774
+ "logps/chosen": -261.25634765625,
1775
+ "logps/rejected": -226.1531982421875,
1776
+ "loss": 1813.0029,
1777
+ "rewards/accuracies": 0.668749988079071,
1778
+ "rewards/chosen": 0.12613138556480408,
1779
+ "rewards/margins": 0.13940462470054626,
1780
+ "rewards/rejected": -0.013273234479129314,
1781
+ "step": 1130
1782
+ },
1783
+ {
1784
+ "epoch": 3.65,
1785
+ "learning_rate": 1.1323784991499471e-07,
1786
+ "logits/chosen": 0.17835107445716858,
1787
+ "logits/rejected": 0.41546598076820374,
1788
+ "logps/chosen": -254.427978515625,
1789
+ "logps/rejected": -208.0858917236328,
1790
+ "loss": 1688.9838,
1791
+ "rewards/accuracies": 0.75,
1792
+ "rewards/chosen": 0.13813026249408722,
1793
+ "rewards/margins": 0.1528724581003189,
1794
+ "rewards/rejected": -0.014742200262844563,
1795
+ "step": 1140
1796
+ },
1797
+ {
1798
+ "epoch": 3.68,
1799
+ "learning_rate": 9.336423115961002e-08,
1800
+ "logits/chosen": 0.13116911053657532,
1801
+ "logits/rejected": 0.34874704480171204,
1802
+ "logps/chosen": -251.4066162109375,
1803
+ "logps/rejected": -205.0511016845703,
1804
+ "loss": 1693.0176,
1805
+ "rewards/accuracies": 0.706250011920929,
1806
+ "rewards/chosen": 0.14480547606945038,
1807
+ "rewards/margins": 0.1530844271183014,
1808
+ "rewards/rejected": -0.008278947323560715,
1809
+ "step": 1150
1810
+ },
1811
+ {
1812
+ "epoch": 3.71,
1813
+ "learning_rate": 7.537392441697793e-08,
1814
+ "logits/chosen": 0.1982724368572235,
1815
+ "logits/rejected": 0.3957718014717102,
1816
+ "logps/chosen": -265.66033935546875,
1817
+ "logps/rejected": -223.394775390625,
1818
+ "loss": 1694.5328,
1819
+ "rewards/accuracies": 0.731249988079071,
1820
+ "rewards/chosen": 0.12068979442119598,
1821
+ "rewards/margins": 0.15094828605651855,
1822
+ "rewards/rejected": -0.030258500948548317,
1823
+ "step": 1160
1824
+ },
1825
+ {
1826
+ "epoch": 3.74,
1827
+ "learning_rate": 5.928100799559938e-08,
1828
+ "logits/chosen": 0.2445194274187088,
1829
+ "logits/rejected": 0.34973251819610596,
1830
+ "logps/chosen": -266.8522033691406,
1831
+ "logps/rejected": -214.7125701904297,
1832
+ "loss": 1810.5473,
1833
+ "rewards/accuracies": 0.65625,
1834
+ "rewards/chosen": 0.12299786508083344,
1835
+ "rewards/margins": 0.13123488426208496,
1836
+ "rewards/rejected": -0.008237012661993504,
1837
+ "step": 1170
1838
+ },
1839
+ {
1840
+ "epoch": 3.78,
1841
+ "learning_rate": 4.5098075401815435e-08,
1842
+ "logits/chosen": 0.17370513081550598,
1843
+ "logits/rejected": 0.44302305579185486,
1844
+ "logps/chosen": -251.4030303955078,
1845
+ "logps/rejected": -217.8542938232422,
1846
+ "loss": 1799.0785,
1847
+ "rewards/accuracies": 0.65625,
1848
+ "rewards/chosen": 0.14102791249752045,
1849
+ "rewards/margins": 0.14266334474086761,
1850
+ "rewards/rejected": -0.0016354346880689263,
1851
+ "step": 1180
1852
+ },
1853
+ {
1854
+ "epoch": 3.81,
1855
+ "learning_rate": 3.283622548476445e-08,
1856
+ "logits/chosen": 0.2375222146511078,
1857
+ "logits/rejected": 0.45388326048851013,
1858
+ "logps/chosen": -243.04116821289062,
1859
+ "logps/rejected": -203.55050659179688,
1860
+ "loss": 1673.1176,
1861
+ "rewards/accuracies": 0.643750011920929,
1862
+ "rewards/chosen": 0.13139726221561432,
1863
+ "rewards/margins": 0.14136573672294617,
1864
+ "rewards/rejected": -0.009968474507331848,
1865
+ "step": 1190
1866
+ },
1867
+ {
1868
+ "epoch": 3.84,
1869
+ "learning_rate": 2.250505375098161e-08,
1870
+ "logits/chosen": 0.28876617550849915,
1871
+ "logits/rejected": 0.42698416113853455,
1872
+ "logps/chosen": -248.3870086669922,
1873
+ "logps/rejected": -211.6997833251953,
1874
+ "loss": 1765.5148,
1875
+ "rewards/accuracies": 0.6812499761581421,
1876
+ "rewards/chosen": 0.1342637687921524,
1877
+ "rewards/margins": 0.1372515708208084,
1878
+ "rewards/rejected": -0.002987801330164075,
1879
+ "step": 1200
1880
+ },
1881
+ {
1882
+ "epoch": 3.84,
1883
+ "eval_logits/chosen": 0.35145023465156555,
1884
+ "eval_logits/rejected": 0.41600194573402405,
1885
+ "eval_logps/chosen": -272.50103759765625,
1886
+ "eval_logps/rejected": -249.63572692871094,
1887
+ "eval_loss": 2546.17138671875,
1888
+ "eval_rewards/accuracies": 0.5220000147819519,
1889
+ "eval_rewards/chosen": -0.15886859595775604,
1890
+ "eval_rewards/margins": 0.00210593082010746,
1891
+ "eval_rewards/rejected": -0.16097451746463776,
1892
+ "eval_runtime": 411.6701,
1893
+ "eval_samples_per_second": 4.858,
1894
+ "eval_steps_per_second": 1.215,
1895
+ "step": 1200
1896
+ },
1897
+ {
1898
+ "epoch": 3.87,
1899
+ "learning_rate": 1.4112644855438228e-08,
1900
+ "logits/chosen": 0.18522273004055023,
1901
+ "logits/rejected": 0.3528333306312561,
1902
+ "logps/chosen": -252.374755859375,
1903
+ "logps/rejected": -213.50112915039062,
1904
+ "loss": 1695.0207,
1905
+ "rewards/accuracies": 0.737500011920929,
1906
+ "rewards/chosen": 0.1518661379814148,
1907
+ "rewards/margins": 0.1503283679485321,
1908
+ "rewards/rejected": 0.001537766307592392,
1909
+ "step": 1210
1910
+ },
1911
+ {
1912
+ "epoch": 3.9,
1913
+ "learning_rate": 7.665566274897007e-09,
1914
+ "logits/chosen": 0.14011794328689575,
1915
+ "logits/rejected": 0.45459121465682983,
1916
+ "logps/chosen": -269.1119079589844,
1917
+ "logps/rejected": -213.98684692382812,
1918
+ "loss": 1599.6027,
1919
+ "rewards/accuracies": 0.731249988079071,
1920
+ "rewards/chosen": 0.1741984784603119,
1921
+ "rewards/margins": 0.17417994141578674,
1922
+ "rewards/rejected": 1.8526614439906552e-05,
1923
+ "step": 1220
1924
+ },
1925
+ {
1926
+ "epoch": 3.94,
1927
+ "learning_rate": 3.1688631685364292e-09,
1928
+ "logits/chosen": 0.11093461513519287,
1929
+ "logits/rejected": 0.3354471027851105,
1930
+ "logps/chosen": -240.45419311523438,
1931
+ "logps/rejected": -200.0811004638672,
1932
+ "loss": 1762.0594,
1933
+ "rewards/accuracies": 0.7124999761581421,
1934
+ "rewards/chosen": 0.14106154441833496,
1935
+ "rewards/margins": 0.14005112648010254,
1936
+ "rewards/rejected": 0.0010104203829541802,
1937
+ "step": 1230
1938
+ },
1939
+ {
1940
+ "epoch": 3.97,
1941
+ "learning_rate": 6.260544298619664e-10,
1942
+ "logits/chosen": 0.1383497416973114,
1943
+ "logits/rejected": 0.3753938674926758,
1944
+ "logps/chosen": -235.23385620117188,
1945
+ "logps/rejected": -191.31582641601562,
1946
+ "loss": 1685.3523,
1947
+ "rewards/accuracies": 0.71875,
1948
+ "rewards/chosen": 0.15916982293128967,
1949
+ "rewards/margins": 0.1575343906879425,
1950
+ "rewards/rejected": 0.0016354434192180634,
1951
+ "step": 1240
1952
+ },
1953
+ {
1954
+ "epoch": 3.99,
1955
+ "step": 1248,
1956
+ "total_flos": 0.0,
1957
+ "train_loss": 1918.4646684695513,
1958
+ "train_runtime": 14252.7073,
1959
+ "train_samples_per_second": 1.403,
1960
+ "train_steps_per_second": 0.088
1961
+ }
1962
+ ],
1963
+ "logging_steps": 10,
1964
+ "max_steps": 1248,
1965
+ "num_input_tokens_seen": 0,
1966
+ "num_train_epochs": 4,
1967
+ "save_steps": 100,
1968
+ "total_flos": 0.0,
1969
+ "train_batch_size": 4,
1970
+ "trial_name": null,
1971
+ "trial_params": null
1972
+ }