BraylonDash committed on
Commit
249e3b5
1 Parent(s): 2190ad7

Model save

README.md ADDED
@@ -0,0 +1,87 @@
+ ---
+ license: mit
+ library_name: peft
+ tags:
+ - trl
+ - dpo
+ - generated_from_trainer
+ base_model: microsoft/phi-2
+ model-index:
+ - name: phi-2-gpo-renew2-b0.001-extra-v2-i1
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # phi-2-gpo-renew2-b0.001-extra-v2-i1
+
+ This model is a fine-tuned version of [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) (the Trainer did not record the name of the training dataset).
+ It achieves the following results on the evaluation set:
+ - Loss: 0.0388
+ - Rewards/chosen: 0.0266
+ - Rewards/rejected: -0.0126
+ - Rewards/accuracies: 0.6070
+ - Rewards/margins: 0.0392
+ - Logps/rejected: -379.8497
+ - Logps/chosen: -369.7509
+ - Logits/rejected: -0.9196
+ - Logits/chosen: -0.9539
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-06
+ - train_batch_size: 4
+ - eval_batch_size: 4
+ - seed: 42
+ - distributed_type: multi-GPU
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 16
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 1
+
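The card is tagged `trl` and `dpo`, so the hyperparameters above most plausibly map onto a TRL `DPOTrainer` run (the `gpo`/`b0.001` suffixes in the run name hint at a modified objective and a small beta, neither of which is documented here). The sketch below is a hypothetical reconstruction using TRL's standard DPO loss as a stand-in; the dataset, LoRA settings, precision, and `beta` value are assumptions, not taken from this commit.

```python
# Hypothetical sketch of a TRL DPO run with the hyperparameters listed in the card.
import torch
from datasets import Dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import DPOTrainer

base_model = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.bfloat16)

# Toy preference data; a real run would use a full prompt/chosen/rejected dataset
# (the card does not say which one was used).
train_dataset = Dataset.from_dict({
    "prompt": ["Explain direct preference optimization in one sentence."],
    "chosen": ["It tunes a model directly on preference pairs without a separate reward model."],
    "rejected": ["I am not sure."],
})

peft_config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],  # assumed Phi attention module names
)

training_args = TrainingArguments(
    output_dir="phi-2-gpo-renew2-b0.001-extra-v2-i1",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,   # 4 x 4 matches the card's total_train_batch_size of 16
    learning_rate=5e-6,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=1,
    seed=42,
    logging_steps=10,
    bf16=True,                       # assumes an Ampere-class GPU
)

trainer = DPOTrainer(
    model,
    ref_model=None,   # with a PEFT adapter, the frozen base model serves as the reference
    beta=0.001,       # assumption, read off the "b0.001" suffix in the run name
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
)
trainer.train()
```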
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
+ |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
+ | 0.098 | 0.06 | 100 | 0.0533 | -0.0029 | -0.0036 | 0.4980 | 0.0007 | -370.8433 | -399.2503 | -0.7225 | -0.8171 |
+ | 0.094 | 0.13 | 200 | 0.0491 | -0.0390 | -0.0525 | 0.5525 | 0.0135 | -419.6949 | -435.2693 | -1.0754 | -1.1388 |
+ | 0.0898 | 0.19 | 300 | 0.0452 | -0.0184 | -0.0403 | 0.5780 | 0.0218 | -407.5088 | -414.7480 | -1.0291 | -1.0858 |
+ | 0.0731 | 0.26 | 400 | 0.0430 | -0.0069 | -0.0331 | 0.5970 | 0.0262 | -400.2979 | -403.1916 | -0.9864 | -1.0412 |
+ | 0.0787 | 0.32 | 500 | 0.0422 | -0.0122 | -0.0473 | 0.6070 | 0.0351 | -414.4887 | -408.4566 | -1.0587 | -1.0975 |
+ | 0.0742 | 0.38 | 600 | 0.0406 | 0.0135 | -0.0175 | 0.6085 | 0.0309 | -384.7105 | -382.8363 | -0.9872 | -1.0246 |
+ | 0.0635 | 0.45 | 700 | 0.0401 | 0.0166 | -0.0188 | 0.6095 | 0.0354 | -386.0258 | -379.6696 | -0.9903 | -1.0225 |
+ | 0.0881 | 0.51 | 800 | 0.0395 | 0.0250 | -0.0102 | 0.6085 | 0.0352 | -377.4323 | -371.2672 | -0.9658 | -0.9975 |
+ | 0.0753 | 0.58 | 900 | 0.0393 | 0.0304 | -0.0046 | 0.5990 | 0.0350 | -371.7872 | -365.8699 | -0.9026 | -0.9456 |
+ | 0.0922 | 0.64 | 1000 | 0.0390 | 0.0286 | -0.0075 | 0.5990 | 0.0361 | -374.7669 | -367.7319 | -0.8801 | -0.9184 |
+ | 0.0703 | 0.7 | 1100 | 0.0389 | 0.0227 | -0.0161 | 0.6000 | 0.0387 | -383.3026 | -373.6226 | -0.9300 | -0.9602 |
+ | 0.0746 | 0.77 | 1200 | 0.0388 | 0.0226 | -0.0179 | 0.6050 | 0.0405 | -385.1601 | -373.7153 | -0.8944 | -0.9306 |
+ | 0.0925 | 0.83 | 1300 | 0.0387 | 0.0263 | -0.0131 | 0.6030 | 0.0393 | -380.3072 | -370.0340 | -0.9171 | -0.9494 |
+ | 0.0863 | 0.9 | 1400 | 0.0387 | 0.0269 | -0.0123 | 0.6055 | 0.0392 | -379.5608 | -369.4450 | -0.9121 | -0.9447 |
+ | 0.0904 | 0.96 | 1500 | 0.0386 | 0.0268 | -0.0124 | 0.6045 | 0.0392 | -379.6000 | -369.4944 | -0.9203 | -0.9536 |
+
+
+ ### Framework versions
+
+ - PEFT 0.7.1
+ - Transformers 4.36.2
+ - Pytorch 2.1.2
+ - Datasets 2.14.6
+ - Tokenizers 0.15.2
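The weights in this commit are a LoRA/PEFT adapter (`adapter_model.safetensors`, below), not a full checkpoint, so inference requires loading it on top of `microsoft/phi-2`. A minimal sketch, assuming the adapter is published under the committer's namespace (the repo id below is a guess; substitute the real one):

```python
# Minimal inference sketch for a PEFT adapter on top of microsoft/phi-2.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2", torch_dtype=torch.float16, device_map="auto"
)
# Assumed repo id, inferred from the run name in this commit.
model = PeftModel.from_pretrained(base, "BraylonDash/phi-2-gpo-renew2-b0.001-extra-v2-i1")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

prompt = "What does DPO fine-tuning change about a language model?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=128, do_sample=False)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```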
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3b0c86b22f417733eab63137db51ac626549db500d3d7d86a2710982489b86f5
+ oid sha256:575c533757ec39ed9b3aec4f881a7caece463a6ba3c82324f139f9e776da1955
  size 167807296
all_results.json ADDED
@@ -0,0 +1,21 @@
+ {
+     "epoch": 1.0,
+     "eval_logits/chosen": -0.9538615942001343,
+     "eval_logits/rejected": -0.9195562601089478,
+     "eval_logps/chosen": -369.7508544921875,
+     "eval_logps/rejected": -379.8497314453125,
+     "eval_loss": 0.03875432908535004,
+     "eval_rewards/accuracies": 0.6069999933242798,
+     "eval_rewards/chosen": 0.02655443549156189,
+     "eval_rewards/margins": 0.03916873782873154,
+     "eval_rewards/rejected": -0.012614301405847073,
+     "eval_runtime": 539.9666,
+     "eval_samples": 2000,
+     "eval_samples_per_second": 3.704,
+     "eval_steps_per_second": 0.926,
+     "train_loss": 0.08419492204701015,
+     "train_runtime": 22113.8812,
+     "train_samples": 61135,
+     "train_samples_per_second": 1.131,
+     "train_steps_per_second": 0.071
+ }
eval_results.json ADDED
@@ -0,0 +1,16 @@
+ {
+     "epoch": 1.0,
+     "eval_logits/chosen": -0.9538615942001343,
+     "eval_logits/rejected": -0.9195562601089478,
+     "eval_logps/chosen": -369.7508544921875,
+     "eval_logps/rejected": -379.8497314453125,
+     "eval_loss": 0.03875432908535004,
+     "eval_rewards/accuracies": 0.6069999933242798,
+     "eval_rewards/chosen": 0.02655443549156189,
+     "eval_rewards/margins": 0.03916873782873154,
+     "eval_rewards/rejected": -0.012614301405847073,
+     "eval_runtime": 539.9666,
+     "eval_samples": 2000,
+     "eval_samples_per_second": 3.704,
+     "eval_steps_per_second": 0.926
+ }
runs/Apr24_18-06-13_gpu4-119-5/events.out.tfevents.1713946169.gpu4-119-5.4164190.0 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c401794718ff39e311ca61a54f70250b574f876ce73fe0e52534fbef9956b144
- size 111552
+ oid sha256:c41419a5d7825d35d06123f941ef2e00fd6b7364cfd3971c346e41154d7c8570
+ size 115710
runs/Apr24_18-06-13_gpu4-119-5/events.out.tfevents.1713968856.gpu4-119-5.4164190.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b2ba85ec9de94c1c5352e8c32d4be230ba59d073d2464df6dad9c69552ff59e9
+ size 828
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 1.0,
+     "train_loss": 0.08419492204701015,
+     "train_runtime": 22113.8812,
+     "train_samples": 61135,
+     "train_samples_per_second": 1.131,
+     "train_steps_per_second": 0.071
+ }
trainer_state.json ADDED
@@ -0,0 +1,2468 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.99968,
5
+ "eval_steps": 100,
6
+ "global_step": 1562,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 3.184713375796179e-08,
14
+ "logits/chosen": -0.9295870065689087,
15
+ "logits/rejected": -0.43873703479766846,
16
+ "logps/chosen": -320.5160827636719,
17
+ "logps/rejected": -293.4969482421875,
18
+ "loss": 0.0436,
19
+ "rewards/accuracies": 0.0,
20
+ "rewards/chosen": 0.0,
21
+ "rewards/margins": 0.0,
22
+ "rewards/rejected": 0.0,
23
+ "step": 1
24
+ },
25
+ {
26
+ "epoch": 0.01,
27
+ "learning_rate": 3.1847133757961787e-07,
28
+ "logits/chosen": -0.8594987988471985,
29
+ "logits/rejected": -0.762122631072998,
30
+ "logps/chosen": -363.58154296875,
31
+ "logps/rejected": -320.7466735839844,
32
+ "loss": 0.1005,
33
+ "rewards/accuracies": 0.4444444477558136,
34
+ "rewards/chosen": 0.00017958095122594386,
35
+ "rewards/margins": 0.00031936122104525566,
36
+ "rewards/rejected": -0.0001397802698193118,
37
+ "step": 10
38
+ },
39
+ {
40
+ "epoch": 0.01,
41
+ "learning_rate": 6.369426751592357e-07,
42
+ "logits/chosen": -0.8261665105819702,
43
+ "logits/rejected": -0.8099607229232788,
44
+ "logps/chosen": -329.95257568359375,
45
+ "logps/rejected": -333.58843994140625,
46
+ "loss": 0.0881,
47
+ "rewards/accuracies": 0.3812499940395355,
48
+ "rewards/chosen": 1.5792587873875163e-05,
49
+ "rewards/margins": 9.325530118076131e-05,
50
+ "rewards/rejected": -7.746272603981197e-05,
51
+ "step": 20
52
+ },
53
+ {
54
+ "epoch": 0.02,
55
+ "learning_rate": 9.554140127388537e-07,
56
+ "logits/chosen": -0.8844400644302368,
57
+ "logits/rejected": -0.8693345189094543,
58
+ "logps/chosen": -266.14617919921875,
59
+ "logps/rejected": -239.8477020263672,
60
+ "loss": 0.1076,
61
+ "rewards/accuracies": 0.33125001192092896,
62
+ "rewards/chosen": -0.00019076233729720116,
63
+ "rewards/margins": -0.00011497503146529198,
64
+ "rewards/rejected": -7.578730583190918e-05,
65
+ "step": 30
66
+ },
67
+ {
68
+ "epoch": 0.03,
69
+ "learning_rate": 1.2738853503184715e-06,
70
+ "logits/chosen": -0.9354842305183411,
71
+ "logits/rejected": -0.8044538497924805,
72
+ "logps/chosen": -337.16436767578125,
73
+ "logps/rejected": -294.0509338378906,
74
+ "loss": 0.1014,
75
+ "rewards/accuracies": 0.40625,
76
+ "rewards/chosen": -0.0002183823671657592,
77
+ "rewards/margins": 0.00015057336713653058,
78
+ "rewards/rejected": -0.00036895571975037456,
79
+ "step": 40
80
+ },
81
+ {
82
+ "epoch": 0.03,
83
+ "learning_rate": 1.5923566878980892e-06,
84
+ "logits/chosen": -0.9226251840591431,
85
+ "logits/rejected": -0.7269760966300964,
86
+ "logps/chosen": -291.8105773925781,
87
+ "logps/rejected": -249.9472198486328,
88
+ "loss": 0.0984,
89
+ "rewards/accuracies": 0.35624998807907104,
90
+ "rewards/chosen": -1.6449857866973616e-05,
91
+ "rewards/margins": -4.999707016395405e-05,
92
+ "rewards/rejected": 3.354722139192745e-05,
93
+ "step": 50
94
+ },
95
+ {
96
+ "epoch": 0.04,
97
+ "learning_rate": 1.9108280254777074e-06,
98
+ "logits/chosen": -0.7843996286392212,
99
+ "logits/rejected": -0.8503357768058777,
100
+ "logps/chosen": -314.31842041015625,
101
+ "logps/rejected": -272.9629821777344,
102
+ "loss": 0.0831,
103
+ "rewards/accuracies": 0.375,
104
+ "rewards/chosen": -0.0004891738062724471,
105
+ "rewards/margins": 3.405969255254604e-05,
106
+ "rewards/rejected": -0.0005232334951870143,
107
+ "step": 60
108
+ },
109
+ {
110
+ "epoch": 0.04,
111
+ "learning_rate": 2.229299363057325e-06,
112
+ "logits/chosen": -0.8444937467575073,
113
+ "logits/rejected": -0.8687158823013306,
114
+ "logps/chosen": -346.148193359375,
115
+ "logps/rejected": -314.0673828125,
116
+ "loss": 0.101,
117
+ "rewards/accuracies": 0.4375,
118
+ "rewards/chosen": -0.0008814434404484928,
119
+ "rewards/margins": 0.0003942731418646872,
120
+ "rewards/rejected": -0.00127571658231318,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.05,
125
+ "learning_rate": 2.547770700636943e-06,
126
+ "logits/chosen": -0.7672846913337708,
127
+ "logits/rejected": -0.8321496844291687,
128
+ "logps/chosen": -308.66973876953125,
129
+ "logps/rejected": -290.4983215332031,
130
+ "loss": 0.0897,
131
+ "rewards/accuracies": 0.375,
132
+ "rewards/chosen": -0.0009476385894231498,
133
+ "rewards/margins": 0.0005286627565510571,
134
+ "rewards/rejected": -0.0014763014623895288,
135
+ "step": 80
136
+ },
137
+ {
138
+ "epoch": 0.06,
139
+ "learning_rate": 2.8662420382165605e-06,
140
+ "logits/chosen": -0.9202607870101929,
141
+ "logits/rejected": -0.6941283345222473,
142
+ "logps/chosen": -294.6612854003906,
143
+ "logps/rejected": -279.28497314453125,
144
+ "loss": 0.0967,
145
+ "rewards/accuracies": 0.4124999940395355,
146
+ "rewards/chosen": -0.0010531718144193292,
147
+ "rewards/margins": 0.0009010445210151374,
148
+ "rewards/rejected": -0.0019542162772268057,
149
+ "step": 90
150
+ },
151
+ {
152
+ "epoch": 0.06,
153
+ "learning_rate": 3.1847133757961785e-06,
154
+ "logits/chosen": -0.8830461502075195,
155
+ "logits/rejected": -0.7728676795959473,
156
+ "logps/chosen": -346.04144287109375,
157
+ "logps/rejected": -351.6465759277344,
158
+ "loss": 0.098,
159
+ "rewards/accuracies": 0.4312500059604645,
160
+ "rewards/chosen": -0.0019343973835930228,
161
+ "rewards/margins": 0.0005748984985984862,
162
+ "rewards/rejected": -0.00250929594039917,
163
+ "step": 100
164
+ },
165
+ {
166
+ "epoch": 0.06,
167
+ "eval_logits/chosen": -0.8170500993728638,
168
+ "eval_logits/rejected": -0.7225118279457092,
169
+ "eval_logps/chosen": -399.25030517578125,
170
+ "eval_logps/rejected": -370.84332275390625,
171
+ "eval_loss": 0.053326018154621124,
172
+ "eval_rewards/accuracies": 0.49799999594688416,
173
+ "eval_rewards/chosen": -0.002945071319118142,
174
+ "eval_rewards/margins": 0.0006628122646361589,
175
+ "eval_rewards/rejected": -0.0036078833509236574,
176
+ "eval_runtime": 539.8002,
177
+ "eval_samples_per_second": 3.705,
178
+ "eval_steps_per_second": 0.926,
179
+ "step": 100
180
+ },
181
+ {
182
+ "epoch": 0.07,
183
+ "learning_rate": 3.5031847133757964e-06,
184
+ "logits/chosen": -0.8748549222946167,
185
+ "logits/rejected": -0.8573201298713684,
186
+ "logps/chosen": -356.620361328125,
187
+ "logps/rejected": -334.7717590332031,
188
+ "loss": 0.0783,
189
+ "rewards/accuracies": 0.46875,
190
+ "rewards/chosen": -0.0018858186667785048,
191
+ "rewards/margins": 0.0009479810250923038,
192
+ "rewards/rejected": -0.0028337999247014523,
193
+ "step": 110
194
+ },
195
+ {
196
+ "epoch": 0.08,
197
+ "learning_rate": 3.821656050955415e-06,
198
+ "logits/chosen": -0.9678497314453125,
199
+ "logits/rejected": -0.7722519636154175,
200
+ "logps/chosen": -313.03741455078125,
201
+ "logps/rejected": -280.0140686035156,
202
+ "loss": 0.1122,
203
+ "rewards/accuracies": 0.35624998807907104,
204
+ "rewards/chosen": -0.0022745749447494745,
205
+ "rewards/margins": 0.0005548128974623978,
206
+ "rewards/rejected": -0.0028293877840042114,
207
+ "step": 120
208
+ },
209
+ {
210
+ "epoch": 0.08,
211
+ "learning_rate": 4.140127388535032e-06,
212
+ "logits/chosen": -0.8600385785102844,
213
+ "logits/rejected": -0.8846877813339233,
214
+ "logps/chosen": -355.20513916015625,
215
+ "logps/rejected": -346.33135986328125,
216
+ "loss": 0.098,
217
+ "rewards/accuracies": 0.3812499940395355,
218
+ "rewards/chosen": -0.005060167517513037,
219
+ "rewards/margins": -0.00011312137212371454,
220
+ "rewards/rejected": -0.004947046283632517,
221
+ "step": 130
222
+ },
223
+ {
224
+ "epoch": 0.09,
225
+ "learning_rate": 4.45859872611465e-06,
226
+ "logits/chosen": -0.848353385925293,
227
+ "logits/rejected": -0.8213680386543274,
228
+ "logps/chosen": -355.6943054199219,
229
+ "logps/rejected": -313.1587829589844,
230
+ "loss": 0.0975,
231
+ "rewards/accuracies": 0.4749999940395355,
232
+ "rewards/chosen": -0.005685682408511639,
233
+ "rewards/margins": 0.0019586696289479733,
234
+ "rewards/rejected": -0.007644351571798325,
235
+ "step": 140
236
+ },
237
+ {
238
+ "epoch": 0.1,
239
+ "learning_rate": 4.777070063694268e-06,
240
+ "logits/chosen": -0.9287412762641907,
241
+ "logits/rejected": -0.8769465684890747,
242
+ "logps/chosen": -329.2186584472656,
243
+ "logps/rejected": -298.8802795410156,
244
+ "loss": 0.0712,
245
+ "rewards/accuracies": 0.4749999940395355,
246
+ "rewards/chosen": -0.00595915038138628,
247
+ "rewards/margins": 0.0024761247914284468,
248
+ "rewards/rejected": -0.00843527540564537,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.1,
253
+ "learning_rate": 4.999943753177818e-06,
254
+ "logits/chosen": -1.005480170249939,
255
+ "logits/rejected": -0.9711716771125793,
256
+ "logps/chosen": -311.71209716796875,
257
+ "logps/rejected": -299.3721923828125,
258
+ "loss": 0.0886,
259
+ "rewards/accuracies": 0.4375,
260
+ "rewards/chosen": -0.00825573317706585,
261
+ "rewards/margins": 0.0037352764047682285,
262
+ "rewards/rejected": -0.011991010047495365,
263
+ "step": 160
264
+ },
265
+ {
266
+ "epoch": 0.11,
267
+ "learning_rate": 4.998943880079481e-06,
268
+ "logits/chosen": -0.9299044609069824,
269
+ "logits/rejected": -0.9373615384101868,
270
+ "logps/chosen": -343.8020324707031,
271
+ "logps/rejected": -338.0310974121094,
272
+ "loss": 0.0924,
273
+ "rewards/accuracies": 0.4749999940395355,
274
+ "rewards/chosen": -0.01192331500351429,
275
+ "rewards/margins": 0.0040515875443816185,
276
+ "rewards/rejected": -0.015974899753928185,
277
+ "step": 170
278
+ },
279
+ {
280
+ "epoch": 0.12,
281
+ "learning_rate": 4.99669465299937e-06,
282
+ "logits/chosen": -0.9855674505233765,
283
+ "logits/rejected": -0.9634091258049011,
284
+ "logps/chosen": -309.28851318359375,
285
+ "logps/rejected": -315.94561767578125,
286
+ "loss": 0.0928,
287
+ "rewards/accuracies": 0.45625001192092896,
288
+ "rewards/chosen": -0.015265477821230888,
289
+ "rewards/margins": 0.005659168586134911,
290
+ "rewards/rejected": -0.0209246464073658,
291
+ "step": 180
292
+ },
293
+ {
294
+ "epoch": 0.12,
295
+ "learning_rate": 4.993197196444851e-06,
296
+ "logits/chosen": -1.0295236110687256,
297
+ "logits/rejected": -1.1208027601242065,
298
+ "logps/chosen": -317.75494384765625,
299
+ "logps/rejected": -328.9816589355469,
300
+ "loss": 0.0805,
301
+ "rewards/accuracies": 0.44999998807907104,
302
+ "rewards/chosen": -0.020896239206194878,
303
+ "rewards/margins": 0.007452984340488911,
304
+ "rewards/rejected": -0.028349224478006363,
305
+ "step": 190
306
+ },
307
+ {
308
+ "epoch": 0.13,
309
+ "learning_rate": 4.988453258979111e-06,
310
+ "logits/chosen": -1.1082799434661865,
311
+ "logits/rejected": -1.0890588760375977,
312
+ "logps/chosen": -350.9955139160156,
313
+ "logps/rejected": -370.35955810546875,
314
+ "loss": 0.094,
315
+ "rewards/accuracies": 0.518750011920929,
316
+ "rewards/chosen": -0.02722758986055851,
317
+ "rewards/margins": 0.009304001927375793,
318
+ "rewards/rejected": -0.036531589925289154,
319
+ "step": 200
320
+ },
321
+ {
322
+ "epoch": 0.13,
323
+ "eval_logits/chosen": -1.1387817859649658,
324
+ "eval_logits/rejected": -1.0754390954971313,
325
+ "eval_logps/chosen": -435.2693176269531,
326
+ "eval_logps/rejected": -419.6948547363281,
327
+ "eval_loss": 0.04912900552153587,
328
+ "eval_rewards/accuracies": 0.5525000095367432,
329
+ "eval_rewards/chosen": -0.03896407037973404,
330
+ "eval_rewards/margins": 0.013495376333594322,
331
+ "eval_rewards/rejected": -0.05245944485068321,
332
+ "eval_runtime": 539.703,
333
+ "eval_samples_per_second": 3.706,
334
+ "eval_steps_per_second": 0.926,
335
+ "step": 200
336
+ },
337
+ {
338
+ "epoch": 0.13,
339
+ "learning_rate": 4.982465212346954e-06,
340
+ "logits/chosen": -1.1744426488876343,
341
+ "logits/rejected": -1.174753189086914,
342
+ "logps/chosen": -324.98370361328125,
343
+ "logps/rejected": -325.42767333984375,
344
+ "loss": 0.1019,
345
+ "rewards/accuracies": 0.48124998807907104,
346
+ "rewards/chosen": -0.031165916472673416,
347
+ "rewards/margins": 0.018221553415060043,
348
+ "rewards/rejected": -0.04938746988773346,
349
+ "step": 210
350
+ },
351
+ {
352
+ "epoch": 0.14,
353
+ "learning_rate": 4.975236050289041e-06,
354
+ "logits/chosen": -1.1808674335479736,
355
+ "logits/rejected": -1.0834966897964478,
356
+ "logps/chosen": -334.90087890625,
357
+ "logps/rejected": -318.37060546875,
358
+ "loss": 0.0697,
359
+ "rewards/accuracies": 0.48124998807907104,
360
+ "rewards/chosen": -0.028991717845201492,
361
+ "rewards/margins": 0.01802077516913414,
362
+ "rewards/rejected": -0.04701249301433563,
363
+ "step": 220
364
+ },
365
+ {
366
+ "epoch": 0.15,
367
+ "learning_rate": 4.96676938704516e-06,
368
+ "logits/chosen": -1.1029061079025269,
369
+ "logits/rejected": -1.1640408039093018,
370
+ "logps/chosen": -435.73687744140625,
371
+ "logps/rejected": -423.72900390625,
372
+ "loss": 0.0749,
373
+ "rewards/accuracies": 0.44999998807907104,
374
+ "rewards/chosen": -0.040695372968912125,
375
+ "rewards/margins": 0.014833291992545128,
376
+ "rewards/rejected": -0.0555286630988121,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 0.15,
381
+ "learning_rate": 4.95706945554728e-06,
382
+ "logits/chosen": -1.1478805541992188,
383
+ "logits/rejected": -1.2186776399612427,
384
+ "logps/chosen": -370.34503173828125,
385
+ "logps/rejected": -392.4799499511719,
386
+ "loss": 0.0734,
387
+ "rewards/accuracies": 0.4749999940395355,
388
+ "rewards/chosen": -0.0380186066031456,
389
+ "rewards/margins": 0.017158757895231247,
390
+ "rewards/rejected": -0.05517736077308655,
391
+ "step": 240
392
+ },
393
+ {
394
+ "epoch": 0.16,
395
+ "learning_rate": 4.9461411053032805e-06,
396
+ "logits/chosen": -1.1441445350646973,
397
+ "logits/rejected": -1.1573059558868408,
398
+ "logps/chosen": -357.66925048828125,
399
+ "logps/rejected": -347.88848876953125,
400
+ "loss": 0.1096,
401
+ "rewards/accuracies": 0.4312500059604645,
402
+ "rewards/chosen": -0.03414061293005943,
403
+ "rewards/margins": 0.009822173975408077,
404
+ "rewards/rejected": -0.043962787836790085,
405
+ "step": 250
406
+ },
407
+ {
408
+ "epoch": 0.17,
409
+ "learning_rate": 4.933989799972431e-06,
410
+ "logits/chosen": -1.1844251155853271,
411
+ "logits/rejected": -1.1023800373077393,
412
+ "logps/chosen": -362.12890625,
413
+ "logps/rejected": -344.55694580078125,
414
+ "loss": 0.087,
415
+ "rewards/accuracies": 0.4937500059604645,
416
+ "rewards/chosen": -0.023841477930545807,
417
+ "rewards/margins": 0.01648232527077198,
418
+ "rewards/rejected": -0.04032380133867264,
419
+ "step": 260
420
+ },
421
+ {
422
+ "epoch": 0.17,
423
+ "learning_rate": 4.920621614633815e-06,
424
+ "logits/chosen": -1.158454418182373,
425
+ "logits/rejected": -1.1034467220306396,
426
+ "logps/chosen": -350.0291748046875,
427
+ "logps/rejected": -357.4220886230469,
428
+ "loss": 0.094,
429
+ "rewards/accuracies": 0.46875,
430
+ "rewards/chosen": -0.02827698551118374,
431
+ "rewards/margins": 0.01563875935971737,
432
+ "rewards/rejected": -0.04391574487090111,
433
+ "step": 270
434
+ },
435
+ {
436
+ "epoch": 0.18,
437
+ "learning_rate": 4.906043232749081e-06,
438
+ "logits/chosen": -1.1796420812606812,
439
+ "logits/rejected": -0.9934653043746948,
440
+ "logps/chosen": -337.7796325683594,
441
+ "logps/rejected": -324.3747863769531,
442
+ "loss": 0.0808,
443
+ "rewards/accuracies": 0.45625001192092896,
444
+ "rewards/chosen": -0.02835283800959587,
445
+ "rewards/margins": 0.013374457135796547,
446
+ "rewards/rejected": -0.04172729700803757,
447
+ "step": 280
448
+ },
449
+ {
450
+ "epoch": 0.19,
451
+ "learning_rate": 4.890261942821023e-06,
452
+ "logits/chosen": -1.0947716236114502,
453
+ "logits/rejected": -1.134075403213501,
454
+ "logps/chosen": -331.79107666015625,
455
+ "logps/rejected": -309.49609375,
456
+ "loss": 0.1014,
457
+ "rewards/accuracies": 0.46875,
458
+ "rewards/chosen": -0.024125058203935623,
459
+ "rewards/margins": 0.01503480039536953,
460
+ "rewards/rejected": -0.0391598604619503,
461
+ "step": 290
462
+ },
463
+ {
464
+ "epoch": 0.19,
465
+ "learning_rate": 4.873285634749678e-06,
466
+ "logits/chosen": -1.1100094318389893,
467
+ "logits/rejected": -1.0816996097564697,
468
+ "logps/chosen": -311.70867919921875,
469
+ "logps/rejected": -315.1243591308594,
470
+ "loss": 0.0898,
471
+ "rewards/accuracies": 0.4625000059604645,
472
+ "rewards/chosen": -0.017298461869359016,
473
+ "rewards/margins": 0.013444353826344013,
474
+ "rewards/rejected": -0.030742818489670753,
475
+ "step": 300
476
+ },
477
+ {
478
+ "epoch": 0.19,
479
+ "eval_logits/chosen": -1.0857516527175903,
480
+ "eval_logits/rejected": -1.0291430950164795,
481
+ "eval_logps/chosen": -414.7480163574219,
482
+ "eval_logps/rejected": -407.5087585449219,
483
+ "eval_loss": 0.045198723673820496,
484
+ "eval_rewards/accuracies": 0.578000009059906,
485
+ "eval_rewards/chosen": -0.018442772328853607,
486
+ "eval_rewards/margins": 0.021830614656209946,
487
+ "eval_rewards/rejected": -0.04027338698506355,
488
+ "eval_runtime": 539.5827,
489
+ "eval_samples_per_second": 3.707,
490
+ "eval_steps_per_second": 0.927,
491
+ "step": 300
492
+ },
493
+ {
494
+ "epoch": 0.2,
495
+ "learning_rate": 4.855122795887746e-06,
496
+ "logits/chosen": -1.1289124488830566,
497
+ "logits/rejected": -1.06770658493042,
498
+ "logps/chosen": -297.93218994140625,
499
+ "logps/rejected": -279.66107177734375,
500
+ "loss": 0.0967,
501
+ "rewards/accuracies": 0.5062500238418579,
502
+ "rewards/chosen": -0.0071363793686032295,
503
+ "rewards/margins": 0.01606178656220436,
504
+ "rewards/rejected": -0.023198166862130165,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 0.2,
509
+ "learning_rate": 4.83578250679731e-06,
510
+ "logits/chosen": -1.0398236513137817,
511
+ "logits/rejected": -1.0716559886932373,
512
+ "logps/chosen": -323.6569519042969,
513
+ "logps/rejected": -321.41845703125,
514
+ "loss": 0.1007,
515
+ "rewards/accuracies": 0.5,
516
+ "rewards/chosen": -0.01684439741075039,
517
+ "rewards/margins": 0.016590449959039688,
518
+ "rewards/rejected": -0.03343484550714493,
519
+ "step": 320
520
+ },
521
+ {
522
+ "epoch": 0.21,
523
+ "learning_rate": 4.8152744367099935e-06,
524
+ "logits/chosen": -1.13577139377594,
525
+ "logits/rejected": -1.0597126483917236,
526
+ "logps/chosen": -253.326171875,
527
+ "logps/rejected": -260.37652587890625,
528
+ "loss": 0.1324,
529
+ "rewards/accuracies": 0.45625001192092896,
530
+ "rewards/chosen": -0.011873602867126465,
531
+ "rewards/margins": 0.021937990561127663,
532
+ "rewards/rejected": -0.03381159156560898,
533
+ "step": 330
534
+ },
535
+ {
536
+ "epoch": 0.22,
537
+ "learning_rate": 4.793608838692792e-06,
538
+ "logits/chosen": -1.05599844455719,
539
+ "logits/rejected": -1.1052017211914062,
540
+ "logps/chosen": -330.0927734375,
541
+ "logps/rejected": -354.5337829589844,
542
+ "loss": 0.0709,
543
+ "rewards/accuracies": 0.4312500059604645,
544
+ "rewards/chosen": -0.014163943938910961,
545
+ "rewards/margins": 0.0242028646171093,
546
+ "rewards/rejected": -0.038366809487342834,
547
+ "step": 340
548
+ },
549
+ {
550
+ "epoch": 0.22,
551
+ "learning_rate": 4.770796544522026e-06,
552
+ "logits/chosen": -1.1162309646606445,
553
+ "logits/rejected": -1.0977243185043335,
554
+ "logps/chosen": -298.46929931640625,
555
+ "logps/rejected": -301.0343017578125,
556
+ "loss": 0.0828,
557
+ "rewards/accuracies": 0.4000000059604645,
558
+ "rewards/chosen": -0.014557460322976112,
559
+ "rewards/margins": 0.010886872187256813,
560
+ "rewards/rejected": -0.025444332510232925,
561
+ "step": 350
562
+ },
563
+ {
564
+ "epoch": 0.23,
565
+ "learning_rate": 4.746848959267968e-06,
566
+ "logits/chosen": -1.0554795265197754,
567
+ "logits/rejected": -0.9742960929870605,
568
+ "logps/chosen": -337.4122314453125,
569
+ "logps/rejected": -334.58367919921875,
570
+ "loss": 0.0999,
571
+ "rewards/accuracies": 0.46875,
572
+ "rewards/chosen": -0.002840764820575714,
573
+ "rewards/margins": 0.01675793156027794,
574
+ "rewards/rejected": -0.019598694518208504,
575
+ "step": 360
576
+ },
577
+ {
578
+ "epoch": 0.24,
579
+ "learning_rate": 4.721778055592841e-06,
580
+ "logits/chosen": -0.995721161365509,
581
+ "logits/rejected": -1.009927749633789,
582
+ "logps/chosen": -307.5436706542969,
583
+ "logps/rejected": -305.3434753417969,
584
+ "loss": 0.0958,
585
+ "rewards/accuracies": 0.48124998807907104,
586
+ "rewards/chosen": 0.0018125835340470076,
587
+ "rewards/margins": 0.01936521753668785,
588
+ "rewards/rejected": -0.017552632838487625,
589
+ "step": 370
590
+ },
591
+ {
592
+ "epoch": 0.24,
593
+ "learning_rate": 4.695596367765054e-06,
594
+ "logits/chosen": -0.9923852682113647,
595
+ "logits/rejected": -0.9759656190872192,
596
+ "logps/chosen": -369.8957824707031,
597
+ "logps/rejected": -335.42694091796875,
598
+ "loss": 0.0605,
599
+ "rewards/accuracies": 0.4625000059604645,
600
+ "rewards/chosen": -0.0068010808899998665,
601
+ "rewards/margins": 0.011534233577549458,
602
+ "rewards/rejected": -0.018335314467549324,
603
+ "step": 380
604
+ },
605
+ {
606
+ "epoch": 0.25,
607
+ "learning_rate": 4.6683169853926615e-06,
608
+ "logits/chosen": -1.056213617324829,
609
+ "logits/rejected": -1.1043860912322998,
610
+ "logps/chosen": -302.78570556640625,
611
+ "logps/rejected": -315.38604736328125,
612
+ "loss": 0.0863,
613
+ "rewards/accuracies": 0.5,
614
+ "rewards/chosen": -0.00549090001732111,
615
+ "rewards/margins": 0.01604645326733589,
616
+ "rewards/rejected": -0.021537352353334427,
617
+ "step": 390
618
+ },
619
+ {
620
+ "epoch": 0.26,
621
+ "learning_rate": 4.639953546879173e-06,
622
+ "logits/chosen": -1.011988639831543,
623
+ "logits/rejected": -0.9677278399467468,
624
+ "logps/chosen": -352.2360534667969,
625
+ "logps/rejected": -355.6038513183594,
626
+ "loss": 0.0731,
627
+ "rewards/accuracies": 0.5375000238418579,
628
+ "rewards/chosen": -0.0009747882140800357,
629
+ "rewards/margins": 0.02269767038524151,
630
+ "rewards/rejected": -0.02367245778441429,
631
+ "step": 400
632
+ },
633
+ {
634
+ "epoch": 0.26,
635
+ "eval_logits/chosen": -1.0412452220916748,
636
+ "eval_logits/rejected": -0.9863678812980652,
637
+ "eval_logps/chosen": -403.1915588378906,
638
+ "eval_logps/rejected": -400.2978820800781,
639
+ "eval_loss": 0.04299690201878548,
640
+ "eval_rewards/accuracies": 0.597000002861023,
641
+ "eval_rewards/chosen": -0.006886274088174105,
642
+ "eval_rewards/margins": 0.026176199316978455,
643
+ "eval_rewards/rejected": -0.03306247293949127,
644
+ "eval_runtime": 539.7275,
645
+ "eval_samples_per_second": 3.706,
646
+ "eval_steps_per_second": 0.926,
647
+ "step": 400
648
+ },
649
+ {
650
+ "epoch": 0.26,
651
+ "learning_rate": 4.610520232605e-06,
652
+ "logits/chosen": -1.0571563243865967,
653
+ "logits/rejected": -0.9747453927993774,
654
+ "logps/chosen": -360.03936767578125,
655
+ "logps/rejected": -312.3787536621094,
656
+ "loss": 0.0706,
657
+ "rewards/accuracies": 0.4749999940395355,
658
+ "rewards/chosen": -0.007904710248112679,
659
+ "rewards/margins": 0.012304016388952732,
660
+ "rewards/rejected": -0.020208725705742836,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 0.27,
665
+ "learning_rate": 4.580031757837931e-06,
666
+ "logits/chosen": -1.0687059164047241,
667
+ "logits/rejected": -1.028510570526123,
668
+ "logps/chosen": -309.14532470703125,
669
+ "logps/rejected": -316.378662109375,
670
+ "loss": 0.0713,
671
+ "rewards/accuracies": 0.4749999940395355,
672
+ "rewards/chosen": -0.0021046344190835953,
673
+ "rewards/margins": 0.0198093019425869,
674
+ "rewards/rejected": -0.021913940086960793,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 0.28,
679
+ "learning_rate": 4.5485033653761936e-06,
680
+ "logits/chosen": -1.0271378755569458,
681
+ "logits/rejected": -1.0654436349868774,
682
+ "logps/chosen": -289.8176574707031,
683
+ "logps/rejected": -310.6886291503906,
684
+ "loss": 0.0907,
685
+ "rewards/accuracies": 0.5,
686
+ "rewards/chosen": 0.001297330716624856,
687
+ "rewards/margins": 0.02493324503302574,
688
+ "rewards/rejected": -0.02363591641187668,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 0.28,
693
+ "learning_rate": 4.5159508179277775e-06,
694
+ "logits/chosen": -1.0649276971817017,
695
+ "logits/rejected": -1.0275962352752686,
696
+ "logps/chosen": -287.9720458984375,
697
+ "logps/rejected": -285.8517761230469,
698
+ "loss": 0.0857,
699
+ "rewards/accuracies": 0.5062500238418579,
700
+ "rewards/chosen": 0.0025724810548126698,
701
+ "rewards/margins": 0.02007809281349182,
702
+ "rewards/rejected": -0.01750561222434044,
703
+ "step": 440
704
+ },
705
+ {
706
+ "epoch": 0.29,
707
+ "learning_rate": 4.48239039022982e-06,
708
+ "logits/chosen": -1.0931072235107422,
709
+ "logits/rejected": -0.9774026870727539,
710
+ "logps/chosen": -302.41497802734375,
711
+ "logps/rejected": -302.8403625488281,
712
+ "loss": 0.0786,
713
+ "rewards/accuracies": 0.48750001192092896,
714
+ "rewards/chosen": 0.0017650052905082703,
715
+ "rewards/margins": 0.017620224505662918,
716
+ "rewards/rejected": -0.015855219215154648,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 0.29,
721
+ "learning_rate": 4.447838860912011e-06,
722
+ "logits/chosen": -1.0981853008270264,
723
+ "logits/rejected": -0.9999715685844421,
724
+ "logps/chosen": -287.58599853515625,
725
+ "logps/rejected": -293.4311218261719,
726
+ "loss": 0.0808,
727
+ "rewards/accuracies": 0.38749998807907104,
728
+ "rewards/chosen": -0.007796216756105423,
729
+ "rewards/margins": 0.02398526482284069,
730
+ "rewards/rejected": -0.03178148344159126,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 0.3,
735
+ "learning_rate": 4.412313504108062e-06,
736
+ "logits/chosen": -1.1234943866729736,
737
+ "logits/rejected": -1.0496309995651245,
738
+ "logps/chosen": -343.08856201171875,
739
+ "logps/rejected": -333.7749938964844,
740
+ "loss": 0.0816,
741
+ "rewards/accuracies": 0.5,
742
+ "rewards/chosen": -0.00918793398886919,
743
+ "rewards/margins": 0.024025408551096916,
744
+ "rewards/rejected": -0.03321333974599838,
745
+ "step": 470
746
+ },
747
+ {
748
+ "epoch": 0.31,
749
+ "learning_rate": 4.375832080819465e-06,
750
+ "logits/chosen": -1.1054179668426514,
751
+ "logits/rejected": -1.063127040863037,
752
+ "logps/chosen": -352.350341796875,
753
+ "logps/rejected": -334.8752746582031,
754
+ "loss": 0.0853,
755
+ "rewards/accuracies": 0.48124998807907104,
756
+ "rewards/chosen": -0.00490077119320631,
757
+ "rewards/margins": 0.02163395844399929,
758
+ "rewards/rejected": -0.026534726843237877,
759
+ "step": 480
760
+ },
761
+ {
762
+ "epoch": 0.31,
763
+ "learning_rate": 4.338412830035823e-06,
764
+ "logits/chosen": -1.0628612041473389,
765
+ "logits/rejected": -1.0398916006088257,
766
+ "logps/chosen": -347.806884765625,
767
+ "logps/rejected": -326.5227355957031,
768
+ "loss": 0.0819,
769
+ "rewards/accuracies": 0.5062500238418579,
770
+ "rewards/chosen": 0.0002868110022973269,
771
+ "rewards/margins": 0.024012237787246704,
772
+ "rewards/rejected": -0.02372542954981327,
773
+ "step": 490
774
+ },
775
+ {
776
+ "epoch": 0.32,
777
+ "learning_rate": 4.300074459616216e-06,
778
+ "logits/chosen": -1.083707332611084,
779
+ "logits/rejected": -1.1182454824447632,
780
+ "logps/chosen": -304.49078369140625,
781
+ "logps/rejected": -335.3125,
782
+ "loss": 0.0787,
783
+ "rewards/accuracies": 0.5625,
784
+ "rewards/chosen": -0.0009085664642043412,
785
+ "rewards/margins": 0.03841399401426315,
786
+ "rewards/rejected": -0.039322562515735626,
787
+ "step": 500
788
+ },
789
+ {
790
+ "epoch": 0.32,
791
+ "eval_logits/chosen": -1.0974537134170532,
792
+ "eval_logits/rejected": -1.0587471723556519,
793
+ "eval_logps/chosen": -408.4565734863281,
794
+ "eval_logps/rejected": -414.48870849609375,
795
+ "eval_loss": 0.0421723797917366,
796
+ "eval_rewards/accuracies": 0.6069999933242798,
797
+ "eval_rewards/chosen": -0.012151296250522137,
798
+ "eval_rewards/margins": 0.0351020023226738,
799
+ "eval_rewards/rejected": -0.04725329577922821,
800
+ "eval_runtime": 539.5972,
801
+ "eval_samples_per_second": 3.706,
802
+ "eval_steps_per_second": 0.927,
803
+ "step": 500
804
+ },
805
+ {
806
+ "epoch": 0.33,
807
+ "learning_rate": 4.260836136936159e-06,
808
+ "logits/chosen": -1.0836219787597656,
809
+ "logits/rejected": -1.0971637964248657,
810
+ "logps/chosen": -287.1617431640625,
811
+ "logps/rejected": -287.58148193359375,
812
+ "loss": 0.0893,
813
+ "rewards/accuracies": 0.4625000059604645,
814
+ "rewards/chosen": -0.0011848447611555457,
815
+ "rewards/margins": 0.033604733645915985,
816
+ "rewards/rejected": -0.03478958457708359,
817
+ "step": 510
818
+ },
819
+ {
820
+ "epoch": 0.33,
821
+ "learning_rate": 4.220717479304816e-06,
822
+ "logits/chosen": -1.0768189430236816,
823
+ "logits/rejected": -1.0736405849456787,
824
+ "logps/chosen": -351.4105529785156,
825
+ "logps/rejected": -374.5068359375,
826
+ "loss": 0.092,
827
+ "rewards/accuracies": 0.550000011920929,
828
+ "rewards/chosen": -0.007005206309258938,
829
+ "rewards/margins": 0.034257300198078156,
830
+ "rewards/rejected": -0.04126249998807907,
831
+ "step": 520
832
+ },
833
+ {
834
+ "epoch": 0.34,
835
+ "learning_rate": 4.179738544157272e-06,
836
+ "logits/chosen": -1.0806314945220947,
837
+ "logits/rejected": -1.0778720378875732,
838
+ "logps/chosen": -275.35101318359375,
839
+ "logps/rejected": -270.8028869628906,
840
+ "loss": 0.0745,
841
+ "rewards/accuracies": 0.4124999940395355,
842
+ "rewards/chosen": 0.0068043931387364864,
843
+ "rewards/margins": 0.013285738416016102,
844
+ "rewards/rejected": -0.006481344345957041,
845
+ "step": 530
846
+ },
847
+ {
848
+ "epoch": 0.35,
849
+ "learning_rate": 4.137919819026762e-06,
850
+ "logits/chosen": -1.0614066123962402,
851
+ "logits/rejected": -0.9721907377243042,
852
+ "logps/chosen": -313.3482971191406,
853
+ "logps/rejected": -321.7149963378906,
854
+ "loss": 0.076,
855
+ "rewards/accuracies": 0.518750011920929,
856
+ "rewards/chosen": 0.009386066347360611,
857
+ "rewards/margins": 0.01596895419061184,
858
+ "rewards/rejected": -0.006582888774573803,
859
+ "step": 540
860
+ },
861
+ {
862
+ "epoch": 0.35,
863
+ "learning_rate": 4.09528221130187e-06,
864
+ "logits/chosen": -1.0457605123519897,
865
+ "logits/rejected": -1.0901412963867188,
866
+ "logps/chosen": -314.82086181640625,
867
+ "logps/rejected": -286.0039367675781,
868
+ "loss": 0.0912,
869
+ "rewards/accuracies": 0.5249999761581421,
870
+ "rewards/chosen": 0.009857202880084515,
871
+ "rewards/margins": 0.024335484951734543,
872
+ "rewards/rejected": -0.014478283934295177,
873
+ "step": 550
874
+ },
875
+ {
876
+ "epoch": 0.36,
877
+ "learning_rate": 4.0518470377738274e-06,
878
+ "logits/chosen": -1.0764890909194946,
879
+ "logits/rejected": -1.1156491041183472,
880
+ "logps/chosen": -293.7644348144531,
881
+ "logps/rejected": -315.3866882324219,
882
+ "loss": 0.0867,
883
+ "rewards/accuracies": 0.48124998807907104,
884
+ "rewards/chosen": 0.011628219857811928,
885
+ "rewards/margins": 0.028168832883238792,
886
+ "rewards/rejected": -0.016540613025426865,
887
+ "step": 560
888
+ },
889
+ {
890
+ "epoch": 0.36,
891
+ "learning_rate": 4.0076360139791155e-06,
892
+ "logits/chosen": -1.0172516107559204,
893
+ "logits/rejected": -1.0550997257232666,
894
+ "logps/chosen": -340.00164794921875,
895
+ "logps/rejected": -341.64068603515625,
896
+ "loss": 0.0722,
897
+ "rewards/accuracies": 0.5625,
898
+ "rewards/chosen": 0.017406892031431198,
899
+ "rewards/margins": 0.03364910930395126,
900
+ "rewards/rejected": -0.016242217272520065,
901
+ "step": 570
902
+ },
903
+ {
904
+ "epoch": 0.37,
905
+ "learning_rate": 3.962671243342728e-06,
906
+ "logits/chosen": -1.089564323425293,
907
+ "logits/rejected": -1.038475513458252,
908
+ "logps/chosen": -314.32012939453125,
909
+ "logps/rejected": -327.6108703613281,
910
+ "loss": 0.0795,
911
+ "rewards/accuracies": 0.46875,
912
+ "rewards/chosen": 0.00651575718075037,
913
+ "rewards/margins": 0.018204618245363235,
914
+ "rewards/rejected": -0.011688861064612865,
915
+ "step": 580
916
+ },
917
+ {
918
+ "epoch": 0.38,
919
+ "learning_rate": 3.916975206127501e-06,
920
+ "logits/chosen": -1.0772429704666138,
921
+ "logits/rejected": -1.0009437799453735,
922
+ "logps/chosen": -317.83563232421875,
923
+ "logps/rejected": -302.42535400390625,
924
+ "loss": 0.0735,
925
+ "rewards/accuracies": 0.5249999761581421,
926
+ "rewards/chosen": 0.006415791809558868,
927
+ "rewards/margins": 0.03005843423306942,
928
+ "rewards/rejected": -0.0236426442861557,
929
+ "step": 590
930
+ },
931
+ {
932
+ "epoch": 0.38,
933
+ "learning_rate": 3.870570748195039e-06,
934
+ "logits/chosen": -1.1022416353225708,
935
+ "logits/rejected": -1.1078603267669678,
936
+ "logps/chosen": -338.0223083496094,
937
+ "logps/rejected": -388.9841003417969,
938
+ "loss": 0.0742,
939
+ "rewards/accuracies": 0.5687500238418579,
940
+ "rewards/chosen": 0.012317690066993237,
941
+ "rewards/margins": 0.049174439162015915,
942
+ "rewards/rejected": -0.036856748163700104,
943
+ "step": 600
944
+ },
945
+ {
946
+ "epoch": 0.38,
947
+ "eval_logits/chosen": -1.0245832204818726,
948
+ "eval_logits/rejected": -0.9872007966041565,
949
+ "eval_logps/chosen": -382.8363342285156,
950
+ "eval_logps/rejected": -384.7105407714844,
951
+ "eval_loss": 0.04055745154619217,
952
+ "eval_rewards/accuracies": 0.6085000038146973,
953
+ "eval_rewards/chosen": 0.013468942604959011,
954
+ "eval_rewards/margins": 0.030944030731916428,
955
+ "eval_rewards/rejected": -0.017475087195634842,
956
+ "eval_runtime": 539.6208,
957
+ "eval_samples_per_second": 3.706,
958
+ "eval_steps_per_second": 0.927,
959
+ "step": 600
960
+ },
961
+ {
962
+ "epoch": 0.39,
963
+ "learning_rate": 3.823481069583869e-06,
964
+ "logits/chosen": -1.0397017002105713,
965
+ "logits/rejected": -0.9971572756767273,
966
+ "logps/chosen": -305.09442138671875,
967
+ "logps/rejected": -314.68621826171875,
968
+ "loss": 0.08,
969
+ "rewards/accuracies": 0.5,
970
+ "rewards/chosen": 0.01682308129966259,
971
+ "rewards/margins": 0.026519659906625748,
972
+ "rewards/rejected": -0.009696578606963158,
973
+ "step": 610
974
+ },
975
+ {
976
+ "epoch": 0.4,
977
+ "learning_rate": 3.7757297129105087e-06,
978
+ "logits/chosen": -1.1045721769332886,
979
+ "logits/rejected": -1.0196996927261353,
980
+ "logps/chosen": -272.93206787109375,
981
+ "logps/rejected": -281.6795349121094,
982
+ "loss": 0.0878,
983
+ "rewards/accuracies": 0.4124999940395355,
984
+ "rewards/chosen": 0.009568464942276478,
985
+ "rewards/margins": 0.01820019818842411,
986
+ "rewards/rejected": -0.008631732314825058,
987
+ "step": 620
988
+ },
989
+ {
990
+ "epoch": 0.4,
991
+ "learning_rate": 3.7273405515992785e-06,
992
+ "logits/chosen": -1.0682175159454346,
993
+ "logits/rejected": -1.0213497877120972,
994
+ "logps/chosen": -284.39495849609375,
995
+ "logps/rejected": -300.61920166015625,
996
+ "loss": 0.0707,
997
+ "rewards/accuracies": 0.4937500059604645,
998
+ "rewards/chosen": 0.01212714146822691,
999
+ "rewards/margins": 0.01712757535278797,
1000
+ "rewards/rejected": -0.005000432953238487,
1001
+ "step": 630
1002
+ },
1003
+ {
1004
+ "epoch": 0.41,
1005
+ "learning_rate": 3.678337777946706e-06,
1006
+ "logits/chosen": -1.0374300479888916,
1007
+ "logits/rejected": -0.9567736387252808,
1008
+ "logps/chosen": -334.5086669921875,
1009
+ "logps/rejected": -323.7910461425781,
1010
+ "loss": 0.0941,
1011
+ "rewards/accuracies": 0.5625,
1012
+ "rewards/chosen": 0.02154025062918663,
1013
+ "rewards/margins": 0.038109757006168365,
1014
+ "rewards/rejected": -0.016569510102272034,
1015
+ "step": 640
1016
+ },
1017
+ {
1018
+ "epoch": 0.42,
1019
+ "learning_rate": 3.6287458910265293e-06,
1020
+ "logits/chosen": -1.0035964250564575,
1021
+ "logits/rejected": -0.9258484840393066,
1022
+ "logps/chosen": -309.08038330078125,
1023
+ "logps/rejected": -294.2804260253906,
1024
+ "loss": 0.0892,
1025
+ "rewards/accuracies": 0.4749999940395355,
1026
+ "rewards/chosen": 0.011333728209137917,
1027
+ "rewards/margins": 0.01899079605937004,
1028
+ "rewards/rejected": -0.0076570697128772736,
1029
+ "step": 650
1030
+ },
1031
+ {
1032
+ "epoch": 0.42,
1033
+ "learning_rate": 3.57858968444131e-06,
1034
+ "logits/chosen": -1.0853677988052368,
1035
+ "logits/rejected": -0.9975327253341675,
1036
+ "logps/chosen": -330.30255126953125,
1037
+ "logps/rejected": -312.9807434082031,
1038
+ "loss": 0.0737,
1039
+ "rewards/accuracies": 0.512499988079071,
1040
+ "rewards/chosen": 0.014233666472136974,
1041
+ "rewards/margins": 0.025601008906960487,
1042
+ "rewards/rejected": -0.011367343366146088,
1043
+ "step": 660
1044
+ },
1045
+ {
1046
+ "epoch": 0.43,
1047
+ "learning_rate": 3.5278942339268034e-06,
1048
+ "logits/chosen": -1.0091218948364258,
1049
+ "logits/rejected": -1.0755088329315186,
1050
+ "logps/chosen": -276.8922424316406,
1051
+ "logps/rejected": -313.3753967285156,
1052
+ "loss": 0.0777,
1053
+ "rewards/accuracies": 0.5249999761581421,
1054
+ "rewards/chosen": 0.016201717779040337,
1055
+ "rewards/margins": 0.032345883548259735,
1056
+ "rewards/rejected": -0.016144167631864548,
1057
+ "step": 670
1058
+ },
1059
+ {
1060
+ "epoch": 0.44,
1061
+ "learning_rate": 3.476684884815279e-06,
1062
+ "logits/chosen": -1.0601623058319092,
1063
+ "logits/rejected": -0.9890631437301636,
1064
+ "logps/chosen": -280.58538818359375,
1065
+ "logps/rejected": -280.070068359375,
1066
+ "loss": 0.1082,
1067
+ "rewards/accuracies": 0.375,
1068
+ "rewards/chosen": 0.00014938972890377045,
1069
+ "rewards/margins": 0.00774354999884963,
1070
+ "rewards/rejected": -0.007594159804284573,
1071
+ "step": 680
1072
+ },
1073
+ {
1074
+ "epoch": 0.44,
1075
+ "learning_rate": 3.424987239364044e-06,
1076
+ "logits/chosen": -1.0564385652542114,
1077
+ "logits/rejected": -1.0079165697097778,
1078
+ "logps/chosen": -290.0457458496094,
1079
+ "logps/rejected": -317.35809326171875,
1080
+ "loss": 0.0844,
1081
+ "rewards/accuracies": 0.574999988079071,
1082
+ "rewards/chosen": 0.01850438304245472,
1083
+ "rewards/margins": 0.03815234825015068,
1084
+ "rewards/rejected": -0.01964796707034111,
1085
+ "step": 690
1086
+ },
1087
+ {
1088
+ "epoch": 0.45,
1089
+ "learning_rate": 3.3728271439555277e-06,
1090
+ "logits/chosen": -1.061989426612854,
1091
+ "logits/rejected": -1.0436265468597412,
1092
+ "logps/chosen": -300.4573669433594,
1093
+ "logps/rejected": -291.84320068359375,
1094
+ "loss": 0.0635,
1095
+ "rewards/accuracies": 0.4749999940395355,
1096
+ "rewards/chosen": 0.012897541746497154,
1097
+ "rewards/margins": 0.022769348695874214,
1098
+ "rewards/rejected": -0.009871806018054485,
1099
+ "step": 700
1100
+ },
1101
+ {
1102
+ "epoch": 0.45,
1103
+ "eval_logits/chosen": -1.022524118423462,
1104
+ "eval_logits/rejected": -0.9903200268745422,
1105
+ "eval_logps/chosen": -379.6695556640625,
1106
+ "eval_logps/rejected": -386.0258483886719,
1107
+ "eval_loss": 0.04010883718729019,
1108
+ "eval_rewards/accuracies": 0.609499990940094,
1109
+ "eval_rewards/chosen": 0.01663573645055294,
1110
+ "eval_rewards/margins": 0.035426173359155655,
1111
+ "eval_rewards/rejected": -0.018790436908602715,
1112
+ "eval_runtime": 539.7506,
1113
+ "eval_samples_per_second": 3.705,
1114
+ "eval_steps_per_second": 0.926,
1115
+ "step": 700
1116
+ },
1117
+ {
1118
+ "epoch": 0.45,
1119
+ "learning_rate": 3.3202306761753078e-06,
1120
+ "logits/chosen": -1.056015133857727,
1121
+ "logits/rejected": -1.044135570526123,
1122
+ "logps/chosen": -291.650634765625,
1123
+ "logps/rejected": -287.6162414550781,
1124
+ "loss": 0.0908,
1125
+ "rewards/accuracies": 0.46875,
1126
+ "rewards/chosen": 0.012986931018531322,
1127
+ "rewards/margins": 0.0346546433866024,
1128
+ "rewards/rejected": -0.021667715162038803,
1129
+ "step": 710
1130
+ },
1131
+ {
1132
+ "epoch": 0.46,
1133
+ "learning_rate": 3.2672241317745513e-06,
1134
+ "logits/chosen": -0.9736151695251465,
1135
+ "logits/rejected": -0.9646120071411133,
1136
+ "logps/chosen": -311.49346923828125,
1137
+ "logps/rejected": -341.27496337890625,
1138
+ "loss": 0.0756,
1139
+ "rewards/accuracies": 0.5062500238418579,
1140
+ "rewards/chosen": 0.017372317612171173,
1141
+ "rewards/margins": 0.028918754309415817,
1142
+ "rewards/rejected": -0.011546434834599495,
1143
+ "step": 720
1144
+ },
1145
+ {
1146
+ "epoch": 0.47,
1147
+ "learning_rate": 3.213834011523378e-06,
1148
+ "logits/chosen": -1.1018531322479248,
1149
+ "logits/rejected": -1.025577187538147,
1150
+ "logps/chosen": -319.1675720214844,
1151
+ "logps/rejected": -316.7040100097656,
1152
+ "loss": 0.0765,
1153
+ "rewards/accuracies": 0.543749988079071,
1154
+ "rewards/chosen": 0.014244127087295055,
1155
+ "rewards/margins": 0.03847852349281311,
1156
+ "rewards/rejected": -0.02423439547419548,
1157
+ "step": 730
1158
+ },
1159
+ {
1160
+ "epoch": 0.47,
1161
+ "learning_rate": 3.160087007961724e-06,
1162
+ "logits/chosen": -1.048353910446167,
1163
+ "logits/rejected": -0.990998387336731,
1164
+ "logps/chosen": -310.2376403808594,
1165
+ "logps/rejected": -319.0958557128906,
1166
+ "loss": 0.0929,
1167
+ "rewards/accuracies": 0.4937500059604645,
1168
+ "rewards/chosen": 0.01772814430296421,
1169
+ "rewards/margins": 0.029912665486335754,
1170
+ "rewards/rejected": -0.012184521183371544,
1171
+ "step": 740
1172
+ },
1173
+ {
1174
+ "epoch": 0.48,
1175
+ "learning_rate": 3.1060099920543404e-06,
1176
+ "logits/chosen": -0.9734305143356323,
1177
+ "logits/rejected": -1.053117275238037,
1178
+ "logps/chosen": -253.1719207763672,
1179
+ "logps/rejected": -268.72723388671875,
1180
+ "loss": 0.1126,
1181
+ "rewards/accuracies": 0.44999998807907104,
1182
+ "rewards/chosen": 0.019246799871325493,
1183
+ "rewards/margins": 0.028338003903627396,
1184
+ "rewards/rejected": -0.009091204032301903,
1185
+ "step": 750
1186
+ },
1187
+ {
1188
+ "epoch": 0.49,
1189
+ "learning_rate": 3.0516299997565675e-06,
1190
+ "logits/chosen": -0.9801149368286133,
1191
+ "logits/rejected": -1.053255558013916,
1192
+ "logps/chosen": -300.00152587890625,
1193
+ "logps/rejected": -316.54656982421875,
1194
+ "loss": 0.0736,
1195
+ "rewards/accuracies": 0.518750011920929,
1196
+ "rewards/chosen": 0.013104112818837166,
1197
+ "rewards/margins": 0.0340370312333107,
1198
+ "rewards/rejected": -0.020932912826538086,
1199
+ "step": 760
1200
+ },
1201
+ {
1202
+ "epoch": 0.49,
1203
+ "learning_rate": 2.996974218497643e-06,
1204
+ "logits/chosen": -0.9665297269821167,
1205
+ "logits/rejected": -1.033067226409912,
1206
+ "logps/chosen": -327.8155822753906,
1207
+ "logps/rejected": -291.17840576171875,
1208
+ "loss": 0.0969,
1209
+ "rewards/accuracies": 0.4312500059604645,
1210
+ "rewards/chosen": 0.013064196333289146,
1211
+ "rewards/margins": 0.014280739240348339,
1212
+ "rewards/rejected": -0.0012165403459221125,
1213
+ "step": 770
1214
+ },
1215
+ {
1216
+ "epoch": 0.5,
1217
+ "learning_rate": 2.9420699735882673e-06,
1218
+ "logits/chosen": -1.103208065032959,
1219
+ "logits/rejected": -1.0667420625686646,
1220
+ "logps/chosen": -284.13043212890625,
1221
+ "logps/rejected": -288.5975646972656,
1222
+ "loss": 0.0726,
1223
+ "rewards/accuracies": 0.45625001192092896,
1224
+ "rewards/chosen": 0.021614065393805504,
1225
+ "rewards/margins": 0.022522414103150368,
1226
+ "rewards/rejected": -0.000908347952645272,
1227
+ "step": 780
1228
+ },
1229
+ {
1230
+ "epoch": 0.51,
1231
+ "learning_rate": 2.8869447145592345e-06,
1232
+ "logits/chosen": -0.9419771432876587,
1233
+ "logits/rejected": -0.9063804745674133,
1234
+ "logps/chosen": -275.3219299316406,
1235
+ "logps/rejected": -281.0148620605469,
1236
+ "loss": 0.0885,
1237
+ "rewards/accuracies": 0.4124999940395355,
1238
+ "rewards/chosen": 0.021688418462872505,
1239
+ "rewards/margins": 0.02460251934826374,
1240
+ "rewards/rejected": -0.0029140994884073734,
1241
+ "step": 790
1242
+ },
1243
+ {
1244
+ "epoch": 0.51,
1245
+ "learning_rate": 2.831626001437969e-06,
1246
+ "logits/chosen": -0.9940840601921082,
1247
+ "logits/rejected": -0.9761594533920288,
1248
+ "logps/chosen": -284.7848205566406,
1249
+ "logps/rejected": -305.55279541015625,
1250
+ "loss": 0.0881,
1251
+ "rewards/accuracies": 0.5249999761581421,
1252
+ "rewards/chosen": 0.02503793314099312,
1253
+ "rewards/margins": 0.03414962440729141,
1254
+ "rewards/rejected": -0.009111693128943443,
1255
+ "step": 800
1256
+ },
1257
+ {
1258
+ "epoch": 0.51,
1259
+ "eval_logits/chosen": -0.9974508881568909,
1260
+ "eval_logits/rejected": -0.9657922387123108,
1261
+ "eval_logps/chosen": -371.2671813964844,
1262
+ "eval_logps/rejected": -377.4323425292969,
1263
+ "eval_loss": 0.03950659930706024,
1264
+ "eval_rewards/accuracies": 0.6085000038146973,
1265
+ "eval_rewards/chosen": 0.025038093328475952,
1266
+ "eval_rewards/margins": 0.03523498401045799,
1267
+ "eval_rewards/rejected": -0.010196887888014317,
1268
+ "eval_runtime": 539.4827,
1269
+ "eval_samples_per_second": 3.707,
1270
+ "eval_steps_per_second": 0.927,
1271
+ "step": 800
1272
+ },
1273
+ {
1274
+ "epoch": 0.52,
1275
+ "learning_rate": 2.776141490969808e-06,
1276
+ "logits/chosen": -0.9662303924560547,
1277
+ "logits/rejected": -1.007033109664917,
1278
+ "logps/chosen": -273.5274963378906,
1279
+ "logps/rejected": -297.3504943847656,
1280
+ "loss": 0.0811,
1281
+ "rewards/accuracies": 0.4124999940395355,
1282
+ "rewards/chosen": 0.017237264662981033,
1283
+ "rewards/margins": 0.020813334733247757,
1284
+ "rewards/rejected": -0.0035760707687586546,
1285
+ "step": 810
1286
+ },
1287
+ {
1288
+ "epoch": 0.52,
1289
+ "learning_rate": 2.720518922790937e-06,
1290
+ "logits/chosen": -1.0530205965042114,
1291
+ "logits/rejected": -0.9808292388916016,
1292
+ "logps/chosen": -260.7068176269531,
1293
+ "logps/rejected": -254.8417205810547,
1294
+ "loss": 0.1111,
1295
+ "rewards/accuracies": 0.45625001192092896,
1296
+ "rewards/chosen": 0.015834391117095947,
1297
+ "rewards/margins": 0.030566086992621422,
1298
+ "rewards/rejected": -0.014731695875525475,
1299
+ "step": 820
1300
+ },
1301
+ {
1302
+ "epoch": 0.53,
1303
+ "learning_rate": 2.66478610555988e-06,
1304
+ "logits/chosen": -0.973135769367218,
1305
+ "logits/rejected": -1.0151408910751343,
1306
+ "logps/chosen": -315.51763916015625,
1307
+ "logps/rejected": -324.4259338378906,
1308
+ "loss": 0.0997,
1309
+ "rewards/accuracies": 0.5062500238418579,
1310
+ "rewards/chosen": 0.022444238886237144,
1311
+ "rewards/margins": 0.034130193293094635,
1312
+ "rewards/rejected": -0.011685955338180065,
1313
+ "step": 830
1314
+ },
1315
+ {
1316
+ "epoch": 0.54,
1317
+ "learning_rate": 2.608970903054482e-06,
1318
+ "logits/chosen": -0.9921044111251831,
1319
+ "logits/rejected": -0.996437668800354,
1320
+ "logps/chosen": -298.2916564941406,
1321
+ "logps/rejected": -307.67156982421875,
1322
+ "loss": 0.0765,
1323
+ "rewards/accuracies": 0.512499988079071,
1324
+ "rewards/chosen": 0.020961161702871323,
1325
+ "rewards/margins": 0.020338475704193115,
1326
+ "rewards/rejected": 0.0006226839614100754,
1327
+ "step": 840
1328
+ },
1329
+ {
1330
+ "epoch": 0.54,
1331
+ "learning_rate": 2.553101220241337e-06,
1332
+ "logits/chosen": -0.955792248249054,
1333
+ "logits/rejected": -0.9853283166885376,
1334
+ "logps/chosen": -322.03619384765625,
1335
+ "logps/rejected": -341.2371520996094,
1336
+ "loss": 0.0739,
1337
+ "rewards/accuracies": 0.46875,
1338
+ "rewards/chosen": 0.02565811201930046,
1339
+ "rewards/margins": 0.01996433734893799,
1340
+ "rewards/rejected": 0.005693775601685047,
1341
+ "step": 850
1342
+ },
1343
+ {
1344
+ "epoch": 0.55,
1345
+ "learning_rate": 2.4972049893246218e-06,
1346
+ "logits/chosen": -0.9531941413879395,
1347
+ "logits/rejected": -0.9866918325424194,
1348
+ "logps/chosen": -299.243896484375,
1349
+ "logps/rejected": -302.88336181640625,
1350
+ "loss": 0.0889,
1351
+ "rewards/accuracies": 0.5,
1352
+ "rewards/chosen": 0.02744477614760399,
1353
+ "rewards/margins": 0.03810085728764534,
1354
+ "rewards/rejected": -0.0106560830026865,
1355
+ "step": 860
1356
+ },
1357
+ {
1358
+ "epoch": 0.56,
1359
+ "learning_rate": 2.4413101557813095e-06,
1360
+ "logits/chosen": -0.9942695498466492,
1361
+ "logits/rejected": -1.0122566223144531,
1362
+ "logps/chosen": -291.08892822265625,
1363
+ "logps/rejected": -301.3489074707031,
1364
+ "loss": 0.076,
1365
+ "rewards/accuracies": 0.48124998807907104,
1366
+ "rewards/chosen": 0.027957703918218613,
1367
+ "rewards/margins": 0.030374949797987938,
1368
+ "rewards/rejected": -0.0024172496050596237,
1369
+ "step": 870
1370
+ },
1371
+ {
1372
+ "epoch": 0.56,
1373
+ "learning_rate": 2.3854446643897566e-06,
1374
+ "logits/chosen": -0.935819149017334,
1375
+ "logits/rejected": -0.8908926248550415,
1376
+ "logps/chosen": -274.655517578125,
1377
+ "logps/rejected": -301.8233642578125,
1378
+ "loss": 0.093,
1379
+ "rewards/accuracies": 0.5,
1380
+ "rewards/chosen": 0.02359806001186371,
1381
+ "rewards/margins": 0.040579214692115784,
1382
+ "rewards/rejected": -0.016981154680252075,
1383
+ "step": 880
1384
+ },
1385
+ {
1386
+ "epoch": 0.57,
1387
+ "learning_rate": 2.3296364452586246e-06,
1388
+ "logits/chosen": -0.9394947290420532,
1389
+ "logits/rejected": -1.015801191329956,
1390
+ "logps/chosen": -244.6657257080078,
1391
+ "logps/rejected": -255.62155151367188,
1392
+ "loss": 0.0973,
1393
+ "rewards/accuracies": 0.40625,
1394
+ "rewards/chosen": 0.02046729251742363,
1395
+ "rewards/margins": 0.011587701737880707,
1396
+ "rewards/rejected": 0.008879591710865498,
1397
+ "step": 890
1398
+ },
1399
+ {
1400
+ "epoch": 0.58,
1401
+ "learning_rate": 2.273913399863151e-06,
1402
+ "logits/chosen": -0.9255924224853516,
1403
+ "logits/rejected": -0.9949308633804321,
1404
+ "logps/chosen": -319.0769958496094,
1405
+ "logps/rejected": -335.22125244140625,
1406
+ "loss": 0.0753,
1407
+ "rewards/accuracies": 0.5,
1408
+ "rewards/chosen": 0.0340295247733593,
1409
+ "rewards/margins": 0.03226994723081589,
1410
+ "rewards/rejected": 0.0017595753306522965,
1411
+ "step": 900
1412
+ },
1413
+ {
1414
+ "epoch": 0.58,
1415
+ "eval_logits/chosen": -0.9455928802490234,
1416
+ "eval_logits/rejected": -0.9026016592979431,
1417
+ "eval_logps/chosen": -365.869873046875,
1418
+ "eval_logps/rejected": -371.78717041015625,
1419
+ "eval_loss": 0.03934764117002487,
1420
+ "eval_rewards/accuracies": 0.5989999771118164,
1421
+ "eval_rewards/chosen": 0.0304353516548872,
1422
+ "eval_rewards/margins": 0.03498707711696625,
1423
+ "eval_rewards/rejected": -0.00455172173678875,
1424
+ "eval_runtime": 539.932,
1425
+ "eval_samples_per_second": 3.704,
1426
+ "eval_steps_per_second": 0.926,
1427
+ "step": 900
1428
+ },
1429
+ {
1430
+ "epoch": 0.58,
1431
+ "learning_rate": 2.2183033870957237e-06,
1432
+ "logits/chosen": -0.9681524038314819,
1433
+ "logits/rejected": -0.9325584173202515,
1434
+ "logps/chosen": -338.40325927734375,
1435
+ "logps/rejected": -328.984619140625,
1436
+ "loss": 0.0758,
1437
+ "rewards/accuracies": 0.5249999761581421,
1438
+ "rewards/chosen": 0.017296601086854935,
1439
+ "rewards/margins": 0.03395865857601166,
1440
+ "rewards/rejected": -0.016662055626511574,
1441
+ "step": 910
1442
+ },
1443
+ {
1444
+ "epoch": 0.59,
1445
+ "learning_rate": 2.1628342093377533e-06,
1446
+ "logits/chosen": -0.9232437014579773,
1447
+ "logits/rejected": -0.9427019357681274,
1448
+ "logps/chosen": -301.0130615234375,
1449
+ "logps/rejected": -300.59088134765625,
1450
+ "loss": 0.0675,
1451
+ "rewards/accuracies": 0.550000011920929,
1452
+ "rewards/chosen": 0.03031068481504917,
1453
+ "rewards/margins": 0.030671527609229088,
1454
+ "rewards/rejected": -0.00036084355087950826,
1455
+ "step": 920
1456
+ },
1457
+ {
1458
+ "epoch": 0.6,
1459
+ "learning_rate": 2.1075335985597954e-06,
1460
+ "logits/chosen": -0.9461824297904968,
1461
+ "logits/rejected": -0.9701375961303711,
1462
+ "logps/chosen": -309.2986755371094,
1463
+ "logps/rejected": -296.10052490234375,
1464
+ "loss": 0.0781,
1465
+ "rewards/accuracies": 0.45625001192092896,
1466
+ "rewards/chosen": 0.02147866040468216,
1467
+ "rewards/margins": 0.023677725344896317,
1468
+ "rewards/rejected": -0.002199061680585146,
1469
+ "step": 930
1470
+ },
1471
+ {
1472
+ "epoch": 0.6,
1473
+ "learning_rate": 2.0524292024568687e-06,
1474
+ "logits/chosen": -0.8931276202201843,
1475
+ "logits/rejected": -0.9411805272102356,
1476
+ "logps/chosen": -305.31793212890625,
1477
+ "logps/rejected": -297.4659729003906,
1478
+ "loss": 0.076,
1479
+ "rewards/accuracies": 0.4625000059604645,
1480
+ "rewards/chosen": 0.024056371301412582,
1481
+ "rewards/margins": 0.024243740364909172,
1482
+ "rewards/rejected": -0.00018737166828941554,
1483
+ "step": 940
1484
+ },
1485
+ {
1486
+ "epoch": 0.61,
1487
+ "learning_rate": 1.9975485706259194e-06,
1488
+ "logits/chosen": -0.9346133470535278,
1489
+ "logits/rejected": -0.9174652099609375,
1490
+ "logps/chosen": -243.5562744140625,
1491
+ "logps/rejected": -260.65185546875,
1492
+ "loss": 0.0812,
1493
+ "rewards/accuracies": 0.5,
1494
+ "rewards/chosen": 0.036083243787288666,
1495
+ "rewards/margins": 0.028012529015541077,
1496
+ "rewards/rejected": 0.00807071290910244,
1497
+ "step": 950
1498
+ },
1499
+ {
1500
+ "epoch": 0.61,
1501
+ "learning_rate": 1.942919140792319e-06,
1502
+ "logits/chosen": -0.9425550699234009,
1503
+ "logits/rejected": -0.8944212198257446,
1504
+ "logps/chosen": -263.16070556640625,
1505
+ "logps/rejected": -296.932861328125,
1506
+ "loss": 0.0774,
1507
+ "rewards/accuracies": 0.53125,
1508
+ "rewards/chosen": 0.03240296244621277,
1509
+ "rewards/margins": 0.03853650763630867,
1510
+ "rewards/rejected": -0.006133544258773327,
1511
+ "step": 960
1512
+ },
1513
+ {
1514
+ "epoch": 0.62,
1515
+ "learning_rate": 1.888568225092296e-06,
1516
+ "logits/chosen": -0.9160875082015991,
1517
+ "logits/rejected": -0.9029878377914429,
1518
+ "logps/chosen": -259.0009460449219,
1519
+ "logps/rejected": -284.0046081542969,
1520
+ "loss": 0.071,
1521
+ "rewards/accuracies": 0.4749999940395355,
1522
+ "rewards/chosen": 0.031025772914290428,
1523
+ "rewards/margins": 0.026529574766755104,
1524
+ "rewards/rejected": 0.004496193490922451,
1525
+ "step": 970
1526
+ },
1527
+ {
1528
+ "epoch": 0.63,
1529
+ "learning_rate": 1.8345229964181628e-06,
1530
+ "logits/chosen": -1.0157150030136108,
1531
+ "logits/rejected": -0.9280340075492859,
1532
+ "logps/chosen": -280.16455078125,
1533
+ "logps/rejected": -297.15655517578125,
1534
+ "loss": 0.0771,
1535
+ "rewards/accuracies": 0.5062500238418579,
1536
+ "rewards/chosen": 0.02662072703242302,
1537
+ "rewards/margins": 0.036351434886455536,
1538
+ "rewards/rejected": -0.009730703197419643,
1539
+ "step": 980
1540
+ },
1541
+ {
1542
+ "epoch": 0.63,
1543
+ "learning_rate": 1.7808104748331459e-06,
1544
+ "logits/chosen": -0.9648904800415039,
1545
+ "logits/rejected": -0.9164671897888184,
1546
+ "logps/chosen": -303.514892578125,
1547
+ "logps/rejected": -292.10748291015625,
1548
+ "loss": 0.0935,
1549
+ "rewards/accuracies": 0.5562499761581421,
1550
+ "rewards/chosen": 0.025951769202947617,
1551
+ "rewards/margins": 0.030179208144545555,
1552
+ "rewards/rejected": -0.004227438475936651,
1553
+ "step": 990
1554
+ },
1555
+ {
1556
+ "epoch": 0.64,
1557
+ "learning_rate": 1.7274575140626318e-06,
1558
+ "logits/chosen": -1.048002004623413,
1559
+ "logits/rejected": -0.9596036672592163,
1560
+ "logps/chosen": -297.2816467285156,
1561
+ "logps/rejected": -316.36981201171875,
1562
+ "loss": 0.0922,
1563
+ "rewards/accuracies": 0.48124998807907104,
1564
+ "rewards/chosen": 0.01601681485772133,
1565
+ "rewards/margins": 0.02387891337275505,
1566
+ "rewards/rejected": -0.007862097583711147,
1567
+ "step": 1000
1568
+ },
1569
+ {
1570
+ "epoch": 0.64,
1571
+ "eval_logits/chosen": -0.9184301495552063,
1572
+ "eval_logits/rejected": -0.8800927996635437,
1573
+ "eval_logps/chosen": -367.73187255859375,
1574
+ "eval_logps/rejected": -374.7669372558594,
1575
+ "eval_loss": 0.03902236372232437,
1576
+ "eval_rewards/accuracies": 0.5989999771118164,
1577
+ "eval_rewards/chosen": 0.028573375195264816,
1578
+ "eval_rewards/margins": 0.03610490262508392,
1579
+ "eval_rewards/rejected": -0.007531528826802969,
1580
+ "eval_runtime": 539.7163,
1581
+ "eval_samples_per_second": 3.706,
1582
+ "eval_steps_per_second": 0.926,
1583
+ "step": 1000
1584
+ },
1585
+ {
1586
+ "epoch": 0.65,
1587
+ "learning_rate": 1.6744907880685735e-06,
1588
+ "logits/chosen": -0.9194186925888062,
1589
+ "logits/rejected": -0.8924249410629272,
1590
+ "logps/chosen": -283.1637878417969,
1591
+ "logps/rejected": -281.9581604003906,
1592
+ "loss": 0.0699,
1593
+ "rewards/accuracies": 0.48750001192092896,
1594
+ "rewards/chosen": 0.02727659046649933,
1595
+ "rewards/margins": 0.020740380510687828,
1596
+ "rewards/rejected": 0.006536208093166351,
1597
+ "step": 1010
1598
+ },
1599
+ {
1600
+ "epoch": 0.65,
1601
+ "learning_rate": 1.6219367777137652e-06,
1602
+ "logits/chosen": -0.9643144607543945,
1603
+ "logits/rejected": -0.9084329605102539,
1604
+ "logps/chosen": -296.56732177734375,
1605
+ "logps/rejected": -299.80718994140625,
1606
+ "loss": 0.0756,
1607
+ "rewards/accuracies": 0.4437499940395355,
1608
+ "rewards/chosen": 0.02761662006378174,
1609
+ "rewards/margins": 0.022495564073324203,
1610
+ "rewards/rejected": 0.00512105505913496,
1611
+ "step": 1020
1612
+ },
1613
+ {
1614
+ "epoch": 0.66,
1615
+ "learning_rate": 1.569821757522666e-06,
1616
+ "logits/chosen": -0.9250129461288452,
1617
+ "logits/rejected": -0.8530179262161255,
1618
+ "logps/chosen": -305.6438903808594,
1619
+ "logps/rejected": -310.8564758300781,
1620
+ "loss": 0.0796,
1621
+ "rewards/accuracies": 0.518750011920929,
1622
+ "rewards/chosen": 0.023295477032661438,
1623
+ "rewards/margins": 0.03615058213472366,
1624
+ "rewards/rejected": -0.012855103239417076,
1625
+ "step": 1030
1626
+ },
1627
+ {
1628
+ "epoch": 0.67,
1629
+ "learning_rate": 1.5181717825453732e-06,
1630
+ "logits/chosen": -0.9920689463615417,
1631
+ "logits/rejected": -0.9287697076797485,
1632
+ "logps/chosen": -268.88922119140625,
1633
+ "logps/rejected": -297.29791259765625,
1634
+ "loss": 0.0907,
1635
+ "rewards/accuracies": 0.44999998807907104,
1636
+ "rewards/chosen": 0.025286570191383362,
1637
+ "rewards/margins": 0.03552253916859627,
1638
+ "rewards/rejected": -0.010235967114567757,
1639
+ "step": 1040
1640
+ },
1641
+ {
1642
+ "epoch": 0.67,
1643
+ "learning_rate": 1.4670126753313286e-06,
1644
+ "logits/chosen": -0.9325569272041321,
1645
+ "logits/rejected": -0.9226212501525879,
1646
+ "logps/chosen": -273.0399169921875,
1647
+ "logps/rejected": -300.64337158203125,
1648
+ "loss": 0.0829,
1649
+ "rewards/accuracies": 0.5,
1650
+ "rewards/chosen": 0.02183723822236061,
1651
+ "rewards/margins": 0.02524760365486145,
1652
+ "rewards/rejected": -0.0034103658981621265,
1653
+ "step": 1050
1654
+ },
1655
+ {
1656
+ "epoch": 0.68,
1657
+ "learning_rate": 1.4163700130192627e-06,
1658
+ "logits/chosen": -0.9867246747016907,
1659
+ "logits/rejected": -0.9270626306533813,
1660
+ "logps/chosen": -240.3236846923828,
1661
+ "logps/rejected": -277.5113525390625,
1662
+ "loss": 0.1007,
1663
+ "rewards/accuracies": 0.4000000059604645,
1664
+ "rewards/chosen": 0.0198238343000412,
1665
+ "rewards/margins": 0.027476917952299118,
1666
+ "rewards/rejected": -0.007653082255274057,
1667
+ "step": 1060
1668
+ },
1669
+ {
1670
+ "epoch": 0.68,
1671
+ "learning_rate": 1.366269114549833e-06,
1672
+ "logits/chosen": -0.9571270942687988,
1673
+ "logits/rejected": -0.9630721807479858,
1674
+ "logps/chosen": -282.31561279296875,
1675
+ "logps/rejected": -273.7547607421875,
1676
+ "loss": 0.0832,
1677
+ "rewards/accuracies": 0.4312500059604645,
1678
+ "rewards/chosen": 0.014912809245288372,
1679
+ "rewards/margins": 0.02561432123184204,
1680
+ "rewards/rejected": -0.01070151012390852,
1681
+ "step": 1070
1682
+ },
1683
+ {
1684
+ "epoch": 0.69,
1685
+ "learning_rate": 1.3167350280073514e-06,
1686
+ "logits/chosen": -0.9449627995491028,
1687
+ "logits/rejected": -0.9669274091720581,
1688
+ "logps/chosen": -290.40899658203125,
1689
+ "logps/rejected": -305.1846923828125,
1690
+ "loss": 0.0881,
1691
+ "rewards/accuracies": 0.45625001192092896,
1692
+ "rewards/chosen": 0.020313743501901627,
1693
+ "rewards/margins": 0.03765721619129181,
1694
+ "rewards/rejected": -0.017343472689390182,
1695
+ "step": 1080
1696
+ },
1697
+ {
1698
+ "epoch": 0.7,
1699
+ "learning_rate": 1.267792518096918e-06,
1700
+ "logits/chosen": -0.954971194267273,
1701
+ "logits/rejected": -1.0019423961639404,
1702
+ "logps/chosen": -314.25335693359375,
1703
+ "logps/rejected": -375.009521484375,
1704
+ "loss": 0.0652,
1705
+ "rewards/accuracies": 0.5562499761581421,
1706
+ "rewards/chosen": 0.023724760860204697,
1707
+ "rewards/margins": 0.04069076478481293,
1708
+ "rewards/rejected": -0.01696600392460823,
1709
+ "step": 1090
1710
+ },
1711
+ {
1712
+ "epoch": 0.7,
1713
+ "learning_rate": 1.2194660537632423e-06,
1714
+ "logits/chosen": -0.9429014325141907,
1715
+ "logits/rejected": -0.9538652300834656,
1716
+ "logps/chosen": -279.4613342285156,
1717
+ "logps/rejected": -323.7087097167969,
1718
+ "loss": 0.0703,
1719
+ "rewards/accuracies": 0.4749999940395355,
1720
+ "rewards/chosen": 0.02060026489198208,
1721
+ "rewards/margins": 0.03607643395662308,
1722
+ "rewards/rejected": -0.015476171858608723,
1723
+ "step": 1100
1724
+ },
1725
+ {
1726
+ "epoch": 0.7,
1727
+ "eval_logits/chosen": -0.9601659178733826,
1728
+ "eval_logits/rejected": -0.9299748539924622,
1729
+ "eval_logps/chosen": -373.62261962890625,
1730
+ "eval_logps/rejected": -383.30255126953125,
1731
+ "eval_loss": 0.03885647654533386,
1732
+ "eval_rewards/accuracies": 0.6000000238418579,
1733
+ "eval_rewards/chosen": 0.022682595998048782,
1734
+ "eval_rewards/margins": 0.03874973580241203,
1735
+ "eval_rewards/rejected": -0.016067136079072952,
1736
+ "eval_runtime": 539.6341,
1737
+ "eval_samples_per_second": 3.706,
1738
+ "eval_steps_per_second": 0.927,
1739
+ "step": 1100
1740
+ },
1741
+ {
1742
+ "epoch": 0.71,
1743
+ "learning_rate": 1.1717797959573262e-06,
1744
+ "logits/chosen": -1.003103494644165,
1745
+ "logits/rejected": -0.9326059222221375,
1746
+ "logps/chosen": -297.8578186035156,
1747
+ "logps/rejected": -297.1626892089844,
1748
+ "loss": 0.0771,
1749
+ "rewards/accuracies": 0.5,
1750
+ "rewards/chosen": 0.017790749669075012,
1751
+ "rewards/margins": 0.035782478749752045,
1752
+ "rewards/rejected": -0.017991727218031883,
1753
+ "step": 1110
1754
+ },
1755
+ {
1756
+ "epoch": 0.72,
1757
+ "learning_rate": 1.1247575855571251e-06,
1758
+ "logits/chosen": -0.9976641535758972,
1759
+ "logits/rejected": -0.9637433886528015,
1760
+ "logps/chosen": -306.8631896972656,
1761
+ "logps/rejected": -350.14324951171875,
1762
+ "loss": 0.081,
1763
+ "rewards/accuracies": 0.543749988079071,
1764
+ "rewards/chosen": 0.029013922438025475,
1765
+ "rewards/margins": 0.05382751673460007,
1766
+ "rewards/rejected": -0.02481359988451004,
1767
+ "step": 1120
1768
+ },
1769
+ {
1770
+ "epoch": 0.72,
1771
+ "learning_rate": 1.07842293144824e-06,
1772
+ "logits/chosen": -0.9757798314094543,
1773
+ "logits/rejected": -0.9444389343261719,
1774
+ "logps/chosen": -312.77191162109375,
1775
+ "logps/rejected": -317.60406494140625,
1776
+ "loss": 0.0816,
1777
+ "rewards/accuracies": 0.5,
1778
+ "rewards/chosen": 0.015693334862589836,
1779
+ "rewards/margins": 0.029305079951882362,
1780
+ "rewards/rejected": -0.013611746951937675,
1781
+ "step": 1130
1782
+ },
1783
+ {
1784
+ "epoch": 0.73,
1785
+ "learning_rate": 1.0327989987705781e-06,
1786
+ "logits/chosen": -1.0041489601135254,
1787
+ "logits/rejected": -1.006225824356079,
1788
+ "logps/chosen": -291.49542236328125,
1789
+ "logps/rejected": -287.52978515625,
1790
+ "loss": 0.0888,
1791
+ "rewards/accuracies": 0.4937500059604645,
1792
+ "rewards/chosen": 0.01858539506793022,
1793
+ "rewards/margins": 0.029126202687621117,
1794
+ "rewards/rejected": -0.01054080855101347,
1795
+ "step": 1140
1796
+ },
1797
+ {
1798
+ "epoch": 0.74,
1799
+ "learning_rate": 9.879085973368805e-07,
1800
+ "logits/chosen": -1.0146872997283936,
1801
+ "logits/rejected": -0.9577957987785339,
1802
+ "logps/chosen": -322.76556396484375,
1803
+ "logps/rejected": -336.08636474609375,
1804
+ "loss": 0.0665,
1805
+ "rewards/accuracies": 0.6000000238418579,
1806
+ "rewards/chosen": 0.023015085607767105,
1807
+ "rewards/margins": 0.05666361376643181,
1808
+ "rewards/rejected": -0.0336485281586647,
1809
+ "step": 1150
1810
+ },
1811
+ {
1812
+ "epoch": 0.74,
1813
+ "learning_rate": 9.437741702288908e-07,
1814
+ "logits/chosen": -1.0026918649673462,
1815
+ "logits/rejected": -0.9933171272277832,
1816
+ "logps/chosen": -299.72308349609375,
1817
+ "logps/rejected": -311.6734924316406,
1818
+ "loss": 0.0794,
1819
+ "rewards/accuracies": 0.4437499940395355,
1820
+ "rewards/chosen": 0.01335589587688446,
1821
+ "rewards/margins": 0.025558674708008766,
1822
+ "rewards/rejected": -0.012202778831124306,
1823
+ "step": 1160
1824
+ },
1825
+ {
1826
+ "epoch": 0.75,
1827
+ "learning_rate": 9.004177825768751e-07,
1828
+ "logits/chosen": -1.0369510650634766,
1829
+ "logits/rejected": -0.9392277002334595,
1830
+ "logps/chosen": -267.6405334472656,
1831
+ "logps/rejected": -298.1479797363281,
1832
+ "loss": 0.0757,
1833
+ "rewards/accuracies": 0.45625001192092896,
1834
+ "rewards/chosen": 0.012007731013000011,
1835
+ "rewards/margins": 0.024681296199560165,
1836
+ "rewards/rejected": -0.012673566117882729,
1837
+ "step": 1170
1838
+ },
1839
+ {
1840
+ "epoch": 0.76,
1841
+ "learning_rate": 8.578611105280987e-07,
1842
+ "logits/chosen": -1.0083134174346924,
1843
+ "logits/rejected": -0.992781937122345,
1844
+ "logps/chosen": -316.5964660644531,
1845
+ "logps/rejected": -365.96661376953125,
1846
+ "loss": 0.0643,
1847
+ "rewards/accuracies": 0.5625,
1848
+ "rewards/chosen": 0.02335866168141365,
1849
+ "rewards/margins": 0.050595641136169434,
1850
+ "rewards/rejected": -0.027236973866820335,
1851
+ "step": 1180
1852
+ },
1853
+ {
1854
+ "epoch": 0.76,
1855
+ "learning_rate": 8.161254304097715e-07,
1856
+ "logits/chosen": -1.026871919631958,
1857
+ "logits/rejected": -0.9791032075881958,
1858
+ "logps/chosen": -277.67547607421875,
1859
+ "logps/rejected": -294.22369384765625,
1860
+ "loss": 0.0853,
1861
+ "rewards/accuracies": 0.42500001192092896,
1862
+ "rewards/chosen": 0.02239377610385418,
1863
+ "rewards/margins": 0.026370327919721603,
1864
+ "rewards/rejected": -0.003976552281528711,
1865
+ "step": 1190
1866
+ },
1867
+ {
1868
+ "epoch": 0.77,
1869
+ "learning_rate": 7.752316080918934e-07,
1870
+ "logits/chosen": -0.942740797996521,
1871
+ "logits/rejected": -0.9156352877616882,
1872
+ "logps/chosen": -290.0126953125,
1873
+ "logps/rejected": -303.8479919433594,
1874
+ "loss": 0.0746,
1875
+ "rewards/accuracies": 0.4375,
1876
+ "rewards/chosen": 0.021792907267808914,
1877
+ "rewards/margins": 0.02630285918712616,
1878
+ "rewards/rejected": -0.004509954713284969,
1879
+ "step": 1200
1880
+ },
1881
+ {
1882
+ "epoch": 0.77,
1883
+ "eval_logits/chosen": -0.930611789226532,
1884
+ "eval_logits/rejected": -0.8943980932235718,
1885
+ "eval_logps/chosen": -373.7153015136719,
1886
+ "eval_logps/rejected": -385.1601257324219,
1887
+ "eval_loss": 0.03884938731789589,
1888
+ "eval_rewards/accuracies": 0.6050000190734863,
1889
+ "eval_rewards/chosen": 0.022589918226003647,
1890
+ "eval_rewards/margins": 0.04051463305950165,
1891
+ "eval_rewards/rejected": -0.017924712970852852,
1892
+ "eval_runtime": 539.5465,
1893
+ "eval_samples_per_second": 3.707,
1894
+ "eval_steps_per_second": 0.927,
1895
+ "step": 1200
1896
+ },
1897
+ {
1898
+ "epoch": 0.77,
1899
+ "learning_rate": 7.352000885553012e-07,
1900
+ "logits/chosen": -0.9893749356269836,
1901
+ "logits/rejected": -0.9994084239006042,
1902
+ "logps/chosen": -307.1074523925781,
1903
+ "logps/rejected": -315.70770263671875,
1904
+ "loss": 0.0767,
1905
+ "rewards/accuracies": 0.48750001192092896,
1906
+ "rewards/chosen": 0.016450155526399612,
1907
+ "rewards/margins": 0.02466857247054577,
1908
+ "rewards/rejected": -0.008218420669436455,
1909
+ "step": 1210
1910
+ },
1911
+ {
1912
+ "epoch": 0.78,
1913
+ "learning_rate": 6.960508856701464e-07,
1914
+ "logits/chosen": -1.0321322679519653,
1915
+ "logits/rejected": -0.9207181930541992,
1916
+ "logps/chosen": -291.80029296875,
1917
+ "logps/rejected": -298.3025817871094,
1918
+ "loss": 0.0945,
1919
+ "rewards/accuracies": 0.46875,
1920
+ "rewards/chosen": 0.015555900521576405,
1921
+ "rewards/margins": 0.02910393476486206,
1922
+ "rewards/rejected": -0.013548034243285656,
1923
+ "step": 1220
1924
+ },
1925
+ {
1926
+ "epoch": 0.79,
1927
+ "learning_rate": 6.578035721899029e-07,
1928
+ "logits/chosen": -0.9535512924194336,
1929
+ "logits/rejected": -1.008029580116272,
1930
+ "logps/chosen": -324.25152587890625,
1931
+ "logps/rejected": -346.2152099609375,
1932
+ "loss": 0.0693,
1933
+ "rewards/accuracies": 0.5062500238418579,
1934
+ "rewards/chosen": 0.012142010033130646,
1935
+ "rewards/margins": 0.03695748746395111,
1936
+ "rewards/rejected": -0.024815475568175316,
1937
+ "step": 1230
1938
+ },
1939
+ {
1940
+ "epoch": 0.79,
1941
+ "learning_rate": 6.204772699659126e-07,
1942
+ "logits/chosen": -1.0189663171768188,
1943
+ "logits/rejected": -1.051673173904419,
1944
+ "logps/chosen": -268.8255920410156,
1945
+ "logps/rejected": -264.9363708496094,
1946
+ "loss": 0.0811,
1947
+ "rewards/accuracies": 0.44999998807907104,
1948
+ "rewards/chosen": 0.022226419299840927,
1949
+ "rewards/margins": 0.02963913045823574,
1950
+ "rewards/rejected": -0.007412709295749664,
1951
+ "step": 1240
1952
+ },
1953
+ {
1954
+ "epoch": 0.8,
1955
+ "learning_rate": 5.840906403873648e-07,
1956
+ "logits/chosen": -0.9550544023513794,
1957
+ "logits/rejected": -0.9686701893806458,
1958
+ "logps/chosen": -311.6850280761719,
1959
+ "logps/rejected": -331.0824279785156,
1960
+ "loss": 0.0602,
1961
+ "rewards/accuracies": 0.518750011920929,
1962
+ "rewards/chosen": 0.01663174293935299,
1963
+ "rewards/margins": 0.03210796043276787,
1964
+ "rewards/rejected": -0.015476214699447155,
1965
+ "step": 1250
1966
+ },
1967
+ {
1968
+ "epoch": 0.81,
1969
+ "learning_rate": 5.486618750514813e-07,
1970
+ "logits/chosen": -0.9893242716789246,
1971
+ "logits/rejected": -0.9823307991027832,
1972
+ "logps/chosen": -319.13421630859375,
1973
+ "logps/rejected": -335.4979553222656,
1974
+ "loss": 0.0904,
1975
+ "rewards/accuracies": 0.48124998807907104,
1976
+ "rewards/chosen": 0.019062984734773636,
1977
+ "rewards/margins": 0.03206023946404457,
1978
+ "rewards/rejected": -0.012997254729270935,
1979
+ "step": 1260
1980
+ },
1981
+ {
1982
+ "epoch": 0.81,
1983
+ "learning_rate": 5.142086866685783e-07,
1984
+ "logits/chosen": -0.9724270701408386,
1985
+ "logits/rejected": -0.9491437673568726,
1986
+ "logps/chosen": -279.63824462890625,
1987
+ "logps/rejected": -299.930908203125,
1988
+ "loss": 0.0748,
1989
+ "rewards/accuracies": 0.4437499940395355,
1990
+ "rewards/chosen": 0.01930154860019684,
1991
+ "rewards/margins": 0.022973302751779556,
1992
+ "rewards/rejected": -0.0036717529874294996,
1993
+ "step": 1270
1994
+ },
1995
+ {
1996
+ "epoch": 0.82,
1997
+ "learning_rate": 4.807483002065439e-07,
1998
+ "logits/chosen": -0.9794095754623413,
1999
+ "logits/rejected": -0.9791723489761353,
2000
+ "logps/chosen": -263.2690124511719,
2001
+ "logps/rejected": -264.7826232910156,
2002
+ "loss": 0.0906,
2003
+ "rewards/accuracies": 0.48124998807907104,
2004
+ "rewards/chosen": 0.020124919712543488,
2005
+ "rewards/margins": 0.028638970106840134,
2006
+ "rewards/rejected": -0.008514048531651497,
2007
+ "step": 1280
2008
+ },
2009
+ {
2010
+ "epoch": 0.83,
2011
+ "learning_rate": 4.4829744427917153e-07,
2012
+ "logits/chosen": -1.0250556468963623,
2013
+ "logits/rejected": -0.9449254870414734,
2014
+ "logps/chosen": -262.35614013671875,
2015
+ "logps/rejected": -292.6833190917969,
2016
+ "loss": 0.0725,
2017
+ "rewards/accuracies": 0.543749988079071,
2018
+ "rewards/chosen": 0.029375359416007996,
2019
+ "rewards/margins": 0.04200378805398941,
2020
+ "rewards/rejected": -0.012628423981368542,
2021
+ "step": 1290
2022
+ },
2023
+ {
2024
+ "epoch": 0.83,
2025
+ "learning_rate": 4.168723427826382e-07,
2026
+ "logits/chosen": -0.964804470539093,
2027
+ "logits/rejected": -0.9295471906661987,
2028
+ "logps/chosen": -272.17938232421875,
2029
+ "logps/rejected": -284.2236022949219,
2030
+ "loss": 0.0925,
2031
+ "rewards/accuracies": 0.46875,
2032
+ "rewards/chosen": 0.01671653613448143,
2033
+ "rewards/margins": 0.0240671094506979,
2034
+ "rewards/rejected": -0.007350574247539043,
2035
+ "step": 1300
2036
+ },
2037
+ {
2038
+ "epoch": 0.83,
2039
+ "eval_logits/chosen": -0.949418306350708,
2040
+ "eval_logits/rejected": -0.9171380400657654,
2041
+ "eval_logps/chosen": -370.0339660644531,
2042
+ "eval_logps/rejected": -380.30718994140625,
2043
+ "eval_loss": 0.03874335065484047,
2044
+ "eval_rewards/accuracies": 0.6029999852180481,
2045
+ "eval_rewards/chosen": 0.02627129666507244,
2046
+ "eval_rewards/margins": 0.039343029260635376,
2047
+ "eval_rewards/rejected": -0.013071730732917786,
2048
+ "eval_runtime": 539.7909,
2049
+ "eval_samples_per_second": 3.705,
2050
+ "eval_steps_per_second": 0.926,
2051
+ "step": 1300
2052
+ },
2053
+ {
2054
+ "epoch": 0.84,
2055
+ "learning_rate": 3.864887067843251e-07,
2056
+ "logits/chosen": -0.9890697598457336,
2057
+ "logits/rejected": -0.8923286199569702,
2058
+ "logps/chosen": -310.97125244140625,
2059
+ "logps/rejected": -309.845703125,
2060
+ "loss": 0.0789,
2061
+ "rewards/accuracies": 0.44999998807907104,
2062
+ "rewards/chosen": 0.019753634929656982,
2063
+ "rewards/margins": 0.031969424337148666,
2064
+ "rewards/rejected": -0.01221578847616911,
2065
+ "step": 1310
2066
+ },
2067
+ {
2068
+ "epoch": 0.84,
2069
+ "learning_rate": 3.5716172666802637e-07,
2070
+ "logits/chosen": -1.0090614557266235,
2071
+ "logits/rejected": -0.9583064317703247,
2072
+ "logps/chosen": -266.8111267089844,
2073
+ "logps/rejected": -302.0957946777344,
2074
+ "loss": 0.0764,
2075
+ "rewards/accuracies": 0.4749999940395355,
2076
+ "rewards/chosen": 0.023147406056523323,
2077
+ "rewards/margins": 0.04169207811355591,
2078
+ "rewards/rejected": -0.018544670194387436,
2079
+ "step": 1320
2080
+ },
2081
+ {
2082
+ "epoch": 0.85,
2083
+ "learning_rate": 3.289060645394704e-07,
2084
+ "logits/chosen": -0.9466385841369629,
2085
+ "logits/rejected": -0.9388877749443054,
2086
+ "logps/chosen": -291.6334533691406,
2087
+ "logps/rejected": -310.3324279785156,
2088
+ "loss": 0.094,
2089
+ "rewards/accuracies": 0.48750001192092896,
2090
+ "rewards/chosen": 0.021957775577902794,
2091
+ "rewards/margins": 0.026331758126616478,
2092
+ "rewards/rejected": -0.0043739816173911095,
2093
+ "step": 1330
2094
+ },
2095
+ {
2096
+ "epoch": 0.86,
2097
+ "learning_rate": 3.0173584689596246e-07,
2098
+ "logits/chosen": -0.9820948839187622,
2099
+ "logits/rejected": -0.9631234407424927,
2100
+ "logps/chosen": -290.67987060546875,
2101
+ "logps/rejected": -281.71405029296875,
2102
+ "loss": 0.0816,
2103
+ "rewards/accuracies": 0.5062500238418579,
2104
+ "rewards/chosen": 0.026712050661444664,
2105
+ "rewards/margins": 0.026149820536375046,
2106
+ "rewards/rejected": 0.0005622319877147675,
2107
+ "step": 1340
2108
+ },
2109
+ {
2110
+ "epoch": 0.86,
2111
+ "learning_rate": 2.756646575638025e-07,
2112
+ "logits/chosen": -0.9123773574829102,
2113
+ "logits/rejected": -0.9757976531982422,
2114
+ "logps/chosen": -270.1868896484375,
2115
+ "logps/rejected": -303.7170715332031,
2116
+ "loss": 0.0753,
2117
+ "rewards/accuracies": 0.46875,
2118
+ "rewards/chosen": 0.025742124766111374,
2119
+ "rewards/margins": 0.034964997321367264,
2120
+ "rewards/rejected": -0.009222874417901039,
2121
+ "step": 1350
2122
+ },
2123
+ {
2124
+ "epoch": 0.87,
2125
+ "learning_rate": 2.507055309070111e-07,
2126
+ "logits/chosen": -0.9779159426689148,
2127
+ "logits/rejected": -0.9737545847892761,
2128
+ "logps/chosen": -285.8861083984375,
2129
+ "logps/rejected": -280.3493347167969,
2130
+ "loss": 0.0912,
2131
+ "rewards/accuracies": 0.48124998807907104,
2132
+ "rewards/chosen": 0.021084221079945564,
2133
+ "rewards/margins": 0.029098382219672203,
2134
+ "rewards/rejected": -0.008014162071049213,
2135
+ "step": 1360
2136
+ },
2137
+ {
2138
+ "epoch": 0.88,
2139
+ "learning_rate": 2.2687094531076565e-07,
2140
+ "logits/chosen": -1.0015113353729248,
2141
+ "logits/rejected": -0.9118536114692688,
2142
+ "logps/chosen": -292.0755920410156,
2143
+ "logps/rejected": -336.7613830566406,
2144
+ "loss": 0.0668,
2145
+ "rewards/accuracies": 0.48750001192092896,
2146
+ "rewards/chosen": 0.023335173726081848,
2147
+ "rewards/margins": 0.042082760483026505,
2148
+ "rewards/rejected": -0.018747588619589806,
2149
+ "step": 1370
2150
+ },
2151
+ {
2152
+ "epoch": 0.88,
2153
+ "learning_rate": 2.0417281694279424e-07,
2154
+ "logits/chosen": -0.9744800329208374,
2155
+ "logits/rejected": -0.8867910504341125,
2156
+ "logps/chosen": -322.03643798828125,
2157
+ "logps/rejected": -347.53192138671875,
2158
+ "loss": 0.0651,
2159
+ "rewards/accuracies": 0.53125,
2160
+ "rewards/chosen": 0.016095371916890144,
2161
+ "rewards/margins": 0.03282975032925606,
2162
+ "rewards/rejected": -0.016734374687075615,
2163
+ "step": 1380
2164
+ },
2165
+ {
2166
+ "epoch": 0.89,
2167
+ "learning_rate": 1.8262249379585484e-07,
2168
+ "logits/chosen": -0.8901314735412598,
2169
+ "logits/rejected": -0.9079896807670593,
2170
+ "logps/chosen": -318.6395568847656,
2171
+ "logps/rejected": -306.5535583496094,
2172
+ "loss": 0.0696,
2173
+ "rewards/accuracies": 0.4749999940395355,
2174
+ "rewards/chosen": 0.02375376783311367,
2175
+ "rewards/margins": 0.02338986098766327,
2176
+ "rewards/rejected": 0.00036390620516613126,
2177
+ "step": 1390
2178
+ },
2179
+ {
2180
+ "epoch": 0.9,
2181
+ "learning_rate": 1.6223075001427667e-07,
2182
+ "logits/chosen": -0.9552518725395203,
2183
+ "logits/rejected": -0.9494439959526062,
2184
+ "logps/chosen": -281.0766906738281,
2185
+ "logps/rejected": -289.63055419921875,
2186
+ "loss": 0.0863,
2187
+ "rewards/accuracies": 0.40625,
2188
+ "rewards/chosen": 0.010858943685889244,
2189
+ "rewards/margins": 0.015300577506422997,
2190
+ "rewards/rejected": -0.004441632889211178,
2191
+ "step": 1400
2192
+ },
2193
+ {
2194
+ "epoch": 0.9,
2195
+ "eval_logits/chosen": -0.9446871280670166,
2196
+ "eval_logits/rejected": -0.9120718240737915,
2197
+ "eval_logps/chosen": -369.4450378417969,
2198
+ "eval_logps/rejected": -379.5608215332031,
2199
+ "eval_loss": 0.038691576570272446,
2200
+ "eval_rewards/accuracies": 0.6054999828338623,
2201
+ "eval_rewards/chosen": 0.02686023712158203,
2202
+ "eval_rewards/margins": 0.03918563574552536,
2203
+ "eval_rewards/rejected": -0.01232539676129818,
2204
+ "eval_runtime": 539.9174,
2205
+ "eval_samples_per_second": 3.704,
2206
+ "eval_steps_per_second": 0.926,
2207
+ "step": 1400
2208
+ },
2209
+ {
2210
+ "epoch": 0.9,
2211
+ "learning_rate": 1.4300778050739317e-07,
2212
+ "logits/chosen": -0.9774508476257324,
2213
+ "logits/rejected": -0.9408830404281616,
2214
+ "logps/chosen": -298.72247314453125,
2215
+ "logps/rejected": -294.75408935546875,
2216
+ "loss": 0.0665,
2217
+ "rewards/accuracies": 0.4437499940395355,
2218
+ "rewards/chosen": 0.01832416281104088,
2219
+ "rewards/margins": 0.014284146018326283,
2220
+ "rewards/rejected": 0.004040017258375883,
2221
+ "step": 1410
2222
+ },
2223
+ {
2224
+ "epoch": 0.91,
2225
+ "learning_rate": 1.2496319585257183e-07,
2226
+ "logits/chosen": -0.9892775416374207,
2227
+ "logits/rejected": -0.9716461300849915,
2228
+ "logps/chosen": -323.1170349121094,
2229
+ "logps/rejected": -303.7762451171875,
2230
+ "loss": 0.0854,
2231
+ "rewards/accuracies": 0.518750011920929,
2232
+ "rewards/chosen": 0.02715294435620308,
2233
+ "rewards/margins": 0.03064989112317562,
2234
+ "rewards/rejected": -0.0034969463013112545,
2235
+ "step": 1420
2236
+ },
2237
+ {
2238
+ "epoch": 0.92,
2239
+ "learning_rate": 1.0810601749037669e-07,
2240
+ "logits/chosen": -0.9020091891288757,
2241
+ "logits/rejected": -0.9708755612373352,
2242
+ "logps/chosen": -289.08258056640625,
2243
+ "logps/rejected": -339.64703369140625,
2244
+ "loss": 0.0713,
2245
+ "rewards/accuracies": 0.4625000059604645,
2246
+ "rewards/chosen": 0.023048024624586105,
2247
+ "rewards/margins": 0.04538671672344208,
2248
+ "rewards/rejected": -0.022338688373565674,
2249
+ "step": 1430
2250
+ },
2251
+ {
2252
+ "epoch": 0.92,
2253
+ "learning_rate": 9.244467321427585e-08,
2254
+ "logits/chosen": -1.0033928155899048,
2255
+ "logits/rejected": -1.0245609283447266,
2256
+ "logps/chosen": -276.730712890625,
2257
+ "logps/rejected": -303.7331237792969,
2258
+ "loss": 0.0785,
2259
+ "rewards/accuracies": 0.6000000238418579,
2260
+ "rewards/chosen": 0.033762019127607346,
2261
+ "rewards/margins": 0.05797597020864487,
2262
+ "rewards/rejected": -0.024213949218392372,
2263
+ "step": 1440
2264
+ },
2265
+ {
2266
+ "epoch": 0.93,
2267
+ "learning_rate": 7.798699295714002e-08,
2268
+ "logits/chosen": -0.9668358564376831,
2269
+ "logits/rejected": -0.9086298942565918,
2270
+ "logps/chosen": -320.6669006347656,
2271
+ "logps/rejected": -296.5704345703125,
2272
+ "loss": 0.0777,
2273
+ "rewards/accuracies": 0.5249999761581421,
2274
+ "rewards/chosen": 0.023420458659529686,
2275
+ "rewards/margins": 0.03309481590986252,
2276
+ "rewards/rejected": -0.009674356319010258,
2277
+ "step": 1450
2278
+ },
2279
+ {
2280
+ "epoch": 0.93,
2281
+ "learning_rate": 6.474020487664934e-08,
2282
+ "logits/chosen": -0.9561643600463867,
2283
+ "logits/rejected": -0.9422094225883484,
2284
+ "logps/chosen": -276.42156982421875,
2285
+ "logps/rejected": -291.9980163574219,
2286
+ "loss": 0.076,
2287
+ "rewards/accuracies": 0.53125,
2288
+ "rewards/chosen": 0.023049993440508842,
2289
+ "rewards/margins": 0.038113731890916824,
2290
+ "rewards/rejected": -0.01506374217569828,
2291
+ "step": 1460
2292
+ },
2293
+ {
2294
+ "epoch": 0.94,
2295
+ "learning_rate": 5.271093174155223e-08,
2296
+ "logits/chosen": -1.0463026762008667,
2297
+ "logits/rejected": -0.9269806742668152,
2298
+ "logps/chosen": -271.7001037597656,
2299
+ "logps/rejected": -273.11419677734375,
2300
+ "loss": 0.0773,
2301
+ "rewards/accuracies": 0.45625001192092896,
2302
+ "rewards/chosen": 0.02771763503551483,
2303
+ "rewards/margins": 0.02654975652694702,
2304
+ "rewards/rejected": 0.0011678790906444192,
2305
+ "step": 1470
2306
+ },
2307
+ {
2308
+ "epoch": 0.95,
2309
+ "learning_rate": 4.190518762059587e-08,
2310
+ "logits/chosen": -0.9642139673233032,
2311
+ "logits/rejected": -0.9358107447624207,
2312
+ "logps/chosen": -301.32647705078125,
2313
+ "logps/rejected": -310.1565856933594,
2314
+ "loss": 0.0955,
2315
+ "rewards/accuracies": 0.4937500059604645,
2316
+ "rewards/chosen": 0.019302543252706528,
2317
+ "rewards/margins": 0.034524787217378616,
2318
+ "rewards/rejected": -0.015222239308059216,
2319
+ "step": 1480
2320
+ },
2321
+ {
2322
+ "epoch": 0.95,
2323
+ "learning_rate": 3.232837487577384e-08,
2324
+ "logits/chosen": -0.9852960705757141,
2325
+ "logits/rejected": -0.9471799731254578,
2326
+ "logps/chosen": -306.68548583984375,
2327
+ "logps/rejected": -294.8326416015625,
2328
+ "loss": 0.0895,
2329
+ "rewards/accuracies": 0.5062500238418579,
2330
+ "rewards/chosen": 0.018351960927248,
2331
+ "rewards/margins": 0.04016681760549545,
2332
+ "rewards/rejected": -0.0218148622661829,
2333
+ "step": 1490
2334
+ },
2335
+ {
2336
+ "epoch": 0.96,
2337
+ "learning_rate": 2.39852814614025e-08,
2338
+ "logits/chosen": -0.9789448976516724,
2339
+ "logits/rejected": -0.9438329935073853,
2340
+ "logps/chosen": -281.9150085449219,
2341
+ "logps/rejected": -267.4187316894531,
2342
+ "loss": 0.0904,
2343
+ "rewards/accuracies": 0.4937500059604645,
2344
+ "rewards/chosen": 0.019616561010479927,
2345
+ "rewards/margins": 0.023770466446876526,
2346
+ "rewards/rejected": -0.004153906367719173,
2347
+ "step": 1500
2348
+ },
2349
+ {
2350
+ "epoch": 0.96,
2351
+ "eval_logits/chosen": -0.9535864591598511,
2352
+ "eval_logits/rejected": -0.9203009009361267,
2353
+ "eval_logps/chosen": -369.494384765625,
2354
+ "eval_logps/rejected": -379.5999755859375,
2355
+ "eval_loss": 0.03863796219229698,
2356
+ "eval_rewards/accuracies": 0.6044999957084656,
2357
+ "eval_rewards/chosen": 0.026810916140675545,
2358
+ "eval_rewards/margins": 0.03917544335126877,
2359
+ "eval_rewards/rejected": -0.012364527210593224,
2360
+ "eval_runtime": 539.9867,
2361
+ "eval_samples_per_second": 3.704,
2362
+ "eval_steps_per_second": 0.926,
2363
+ "step": 1500
2364
+ },
2365
+ {
2366
+ "epoch": 0.97,
2367
+ "learning_rate": 1.6880078530367716e-08,
2368
+ "logits/chosen": -0.9464886784553528,
2369
+ "logits/rejected": -0.9203370213508606,
2370
+ "logps/chosen": -291.42730712890625,
2371
+ "logps/rejected": -302.47357177734375,
2372
+ "loss": 0.0968,
2373
+ "rewards/accuracies": 0.48750001192092896,
2374
+ "rewards/chosen": 0.018558986485004425,
2375
+ "rewards/margins": 0.03005158342421055,
2376
+ "rewards/rejected": -0.011492597870528698,
2377
+ "step": 1510
2378
+ },
2379
+ {
2380
+ "epoch": 0.97,
2381
+ "learning_rate": 1.1016318348746058e-08,
2382
+ "logits/chosen": -0.9404022097587585,
2383
+ "logits/rejected": -0.9021556973457336,
2384
+ "logps/chosen": -266.0870666503906,
2385
+ "logps/rejected": -277.4389343261719,
2386
+ "loss": 0.1015,
2387
+ "rewards/accuracies": 0.44999998807907104,
2388
+ "rewards/chosen": 0.021859928965568542,
2389
+ "rewards/margins": 0.028949573636054993,
2390
+ "rewards/rejected": -0.007089647464454174,
2391
+ "step": 1520
2392
+ },
2393
+ {
2394
+ "epoch": 0.98,
2395
+ "learning_rate": 6.396932519840693e-09,
2396
+ "logits/chosen": -0.9894587397575378,
2397
+ "logits/rejected": -1.043691873550415,
2398
+ "logps/chosen": -284.725830078125,
2399
+ "logps/rejected": -302.0658874511719,
2400
+ "loss": 0.0885,
2401
+ "rewards/accuracies": 0.4375,
2402
+ "rewards/chosen": 0.01785745844244957,
2403
+ "rewards/margins": 0.0238940566778183,
2404
+ "rewards/rejected": -0.0060366010293364525,
2405
+ "step": 1530
2406
+ },
2407
+ {
2408
+ "epoch": 0.99,
2409
+ "learning_rate": 3.024230518515192e-09,
2410
+ "logits/chosen": -1.0197975635528564,
2411
+ "logits/rejected": -0.9348329305648804,
2412
+ "logps/chosen": -242.8181915283203,
2413
+ "logps/rejected": -304.18841552734375,
2414
+ "loss": 0.115,
2415
+ "rewards/accuracies": 0.44999998807907104,
2416
+ "rewards/chosen": 0.027777481824159622,
2417
+ "rewards/margins": 0.04096692427992821,
2418
+ "rewards/rejected": -0.013189440593123436,
2419
+ "step": 1540
2420
+ },
2421
+ {
2422
+ "epoch": 0.99,
2423
+ "learning_rate": 8.998985365679669e-10,
2424
+ "logits/chosen": -0.918280303478241,
2425
+ "logits/rejected": -0.9589518308639526,
2426
+ "logps/chosen": -293.98236083984375,
2427
+ "logps/rejected": -301.8656921386719,
2428
+ "loss": 0.0665,
2429
+ "rewards/accuracies": 0.4625000059604645,
2430
+ "rewards/chosen": 0.025340059772133827,
2431
+ "rewards/margins": 0.033479273319244385,
2432
+ "rewards/rejected": -0.008139212615787983,
2433
+ "step": 1550
2434
+ },
2435
+ {
2436
+ "epoch": 1.0,
2437
+ "learning_rate": 2.499863971494598e-11,
2438
+ "logits/chosen": -0.9552096128463745,
2439
+ "logits/rejected": -0.8985775113105774,
2440
+ "logps/chosen": -317.04779052734375,
2441
+ "logps/rejected": -310.51031494140625,
2442
+ "loss": 0.0696,
2443
+ "rewards/accuracies": 0.5249999761581421,
2444
+ "rewards/chosen": 0.025858771055936813,
2445
+ "rewards/margins": 0.03246783837676048,
2446
+ "rewards/rejected": -0.006609070114791393,
2447
+ "step": 1560
2448
+ },
2449
+ {
+ "epoch": 1.0,
+ "step": 1562,
+ "total_flos": 0.0,
+ "train_loss": 0.08419492204701015,
+ "train_runtime": 22113.8812,
+ "train_samples_per_second": 1.131,
+ "train_steps_per_second": 0.071
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 1562,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 0.0,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
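
For readers who want to inspect these logs outside the diff view, the sketch below shows one way to pull the evaluation checkpoints out of a `trainer_state.json` like the one above and print loss and reward accuracy per eval step. It is a minimal illustration, not part of the training code; the file path is a placeholder, and only keys visible in the log above (`log_history`, `step`, `eval_loss`, `eval_rewards/accuracies`, `eval_rewards/margins`) are assumed.

```python
import json

# Placeholder path: point this at the trainer_state.json saved with this commit.
PATH = "trainer_state.json"

with open(PATH) as f:
    state = json.load(f)

# log_history mixes per-step training records (loss, learning_rate, ...) with
# evaluation checkpoints (eval_loss, eval_rewards/*). Keep only the eval entries.
evals = [e for e in state["log_history"] if "eval_loss" in e]

for e in evals:
    print(
        f"step {e['step']:>5} | "
        f"eval_loss {e['eval_loss']:.4f} | "
        f"reward acc {e['eval_rewards/accuracies']:.4f} | "
        f"reward margin {e['eval_rewards/margins']:.4f}"
    )
```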