shenxq committed on
Commit
41409cb
1 Parent(s): 30a24a4

Model save

README.md ADDED
@@ -0,0 +1,84 @@
1
+ ---
2
+ license: apache-2.0
3
+ library_name: peft
4
+ tags:
5
+ - trl
6
+ - dpo
7
+ - generated_from_trainer
8
+ base_model: mistralai/Mistral-7B-Instruct-v0.2
9
+ model-index:
10
+ - name: zephyr-7b-dpo-qlora-pairrm
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # zephyr-7b-dpo-qlora-pairrm
18
+
19
+ This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on the None dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.6773
22
+ - Rewards/chosen: -1.5442
23
+ - Rewards/rejected: -1.6837
24
+ - Rewards/accuracies: 0.5687
25
+ - Rewards/margins: 0.1395
26
+ - Logps/rejected: -394.7031
27
+ - Logps/chosen: -375.1367
28
+ - Logits/rejected: -4.4436
29
+ - Logits/chosen: -4.4568
30
+
31
+ ## Model description
32
+
33
+ More information needed
34
+
35
+ ## Intended uses & limitations
36
+
37
+ More information needed
38
+
39
+ ## Training and evaluation data
40
+
41
+ More information needed
42
+
43
+ ## Training procedure
44
+
45
+ ### Training hyperparameters
46
+
47
+ The following hyperparameters were used during training:
48
+ - learning_rate: 5e-06
49
+ - train_batch_size: 4
50
+ - eval_batch_size: 8
51
+ - seed: 42
52
+ - distributed_type: multi-GPU
53
+ - gradient_accumulation_steps: 4
54
+ - total_train_batch_size: 16
55
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
56
+ - lr_scheduler_type: cosine
57
+ - lr_scheduler_warmup_ratio: 0.1
58
+ - num_epochs: 1
59
+
60
+ ### Training results
61
+
62
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
63
+ |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
64
+ | 0.6909 | 0.08 | 100 | 0.6921 | -0.0169 | -0.0192 | 0.5427 | 0.0023 | -228.2522 | -222.4038 | -2.6219 | -2.6247 |
65
+ | 0.684 | 0.16 | 200 | 0.6873 | -0.0803 | -0.0944 | 0.5567 | 0.0141 | -235.7721 | -228.7468 | -2.7599 | -2.7629 |
66
+ | 0.6795 | 0.24 | 300 | 0.6839 | -0.5138 | -0.5516 | 0.5460 | 0.0378 | -281.4856 | -272.0929 | -3.5141 | -3.5199 |
67
+ | 0.6561 | 0.32 | 400 | 0.6812 | -0.8158 | -0.8788 | 0.5573 | 0.0630 | -314.2105 | -302.2954 | -3.7484 | -3.7580 |
68
+ | 0.633 | 0.4 | 500 | 0.6787 | -0.9027 | -0.9810 | 0.5597 | 0.0782 | -324.4269 | -310.9858 | -4.0978 | -4.1077 |
69
+ | 0.6302 | 0.48 | 600 | 0.6785 | -1.1692 | -1.2692 | 0.5597 | 0.1000 | -353.2493 | -337.6355 | -4.4318 | -4.4435 |
70
+ | 0.5743 | 0.56 | 700 | 0.6835 | -1.5435 | -1.6640 | 0.5630 | 0.1205 | -392.7273 | -375.0575 | -4.5047 | -4.5182 |
71
+ | 0.6443 | 0.64 | 800 | 0.6779 | -1.3860 | -1.5069 | 0.5667 | 0.1209 | -377.0208 | -359.3108 | -4.2453 | -4.2572 |
72
+ | 0.6651 | 0.72 | 900 | 0.6819 | -1.6633 | -1.8040 | 0.5693 | 0.1408 | -406.7332 | -387.0414 | -4.6039 | -4.6178 |
73
+ | 0.5993 | 0.8 | 1000 | 0.6785 | -1.5776 | -1.7191 | 0.5683 | 0.1415 | -398.2364 | -378.4713 | -4.5356 | -4.5491 |
74
+ | 0.6759 | 0.88 | 1100 | 0.6778 | -1.5515 | -1.6923 | 0.5687 | 0.1408 | -395.5604 | -375.8654 | -4.4722 | -4.4855 |
75
+ | 0.6402 | 0.96 | 1200 | 0.6773 | -1.5439 | -1.6837 | 0.5690 | 0.1398 | -394.7002 | -375.1028 | -4.4444 | -4.4577 |
76
+
77
+
78
+ ### Framework versions
79
+
80
+ - PEFT 0.7.1
81
+ - Transformers 4.36.2
82
+ - Pytorch 2.1.2
83
+ - Datasets 2.14.6
84
+ - Tokenizers 0.15.0
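
The card above names the base model (mistralai/Mistral-7B-Instruct-v0.2) and the framework versions but includes no usage snippet. Below is a minimal sketch of loading this QLoRA/PEFT adapter for inference; the adapter id is an assumption standing in for this repository's actual path, and the prompt is only an example.

```python
# Minimal sketch: load the QLoRA adapter from this repo on top of the base model.
# "zephyr-7b-dpo-qlora-pairrm" is an assumed repo id / local path for this adapter;
# replace it with wherever adapter_model.safetensors actually lives.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "mistralai/Mistral-7B-Instruct-v0.2"
adapter_id = "zephyr-7b-dpo-qlora-pairrm"  # assumption, not a verified hub id

tokenizer = AutoTokenizer.from_pretrained(base_id)
base_model = AutoModelForCausalLM.from_pretrained(
    base_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, adapter_id)

prompt = "[INST] Summarize what DPO fine-tuning does in one sentence. [/INST]"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```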
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:955ca9f7427ed7184975097fd6189b6bd306dfa6a35b56bd0fcf872d711120a2
+ oid sha256:f66dedcfa579dcdfb9f1d78c397d7645e70f57c173df9dd20bfc82911ce23de6
  size 1342238560
all_results.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "epoch": 1.0,
3
+ "eval_logits/chosen": -4.456838607788086,
4
+ "eval_logits/rejected": -4.443554401397705,
5
+ "eval_logps/chosen": -375.13671875,
6
+ "eval_logps/rejected": -394.703125,
7
+ "eval_loss": 0.6773372888565063,
8
+ "eval_rewards/accuracies": 0.5686666369438171,
9
+ "eval_rewards/chosen": -1.5442434549331665,
10
+ "eval_rewards/margins": 0.13949742913246155,
11
+ "eval_rewards/rejected": -1.6837408542633057,
12
+ "eval_runtime": 1623.421,
13
+ "eval_samples": 2994,
14
+ "eval_samples_per_second": 1.844,
15
+ "eval_steps_per_second": 0.231,
16
+ "train_loss": 0.6475976420174225,
17
+ "train_runtime": 42677.4758,
18
+ "train_samples": 19996,
19
+ "train_samples_per_second": 0.469,
20
+ "train_steps_per_second": 0.029
21
+ }
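
The rewards/* fields in these result files follow TRL's DPO convention: the implicit reward of a completion is beta * (log prob under the policy minus log prob under the frozen reference model), and margins/accuracies are derived per chosen/rejected pair. The beta value is not recorded in this repo, so the sketch below assumes one purely for illustration, as it does the example log-probabilities.

```python
# Sketch of how TRL's DPO metrics relate to the logged log-probabilities.
# beta is not reported anywhere in this repo; 0.1 is an assumed value used
# purely for illustration, as are the example log-probabilities.
beta = 0.1  # assumption

def implicit_reward(policy_logps: float, ref_logps: float) -> float:
    """DPO implicit reward: beta * (log pi_theta(y|x) - log pi_ref(y|x))."""
    return beta * (policy_logps - ref_logps)

# Hypothetical per-pair log-probabilities (not taken from this run):
chosen = implicit_reward(policy_logps=-375.0, ref_logps=-360.0)
rejected = implicit_reward(policy_logps=-395.0, ref_logps=-378.0)

margin = chosen - rejected            # corresponds to rewards/margins
accuracy = float(chosen > rejected)   # averaged over pairs -> rewards/accuracies
print(f"margin={margin:.3f}, accuracy={accuracy}")
```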
eval_results.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "epoch": 1.0,
3
+ "eval_logits/chosen": -4.456838607788086,
4
+ "eval_logits/rejected": -4.443554401397705,
5
+ "eval_logps/chosen": -375.13671875,
6
+ "eval_logps/rejected": -394.703125,
7
+ "eval_loss": 0.6773372888565063,
8
+ "eval_rewards/accuracies": 0.5686666369438171,
9
+ "eval_rewards/chosen": -1.5442434549331665,
10
+ "eval_rewards/margins": 0.13949742913246155,
11
+ "eval_rewards/rejected": -1.6837408542633057,
12
+ "eval_runtime": 1623.421,
13
+ "eval_samples": 2994,
14
+ "eval_samples_per_second": 1.844,
15
+ "eval_steps_per_second": 0.231
16
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.6475976420174225,
4
+ "train_runtime": 42677.4758,
5
+ "train_samples": 19996,
6
+ "train_samples_per_second": 0.469,
7
+ "train_steps_per_second": 0.029
8
+ }
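
As a quick sanity check, the throughput figures above are consistent with the sample count, the runtime, the effective batch size of 16 from the README, and the 1249 global steps recorded in trainer_state.json below:

```python
# Consistency check of the throughput numbers in train_results.json.
train_samples = 19996
train_runtime_s = 42677.4758
global_steps = 1249
effective_batch_size = 16

print(train_samples / train_runtime_s)       # ~0.469, matches train_samples_per_second
print(global_steps / train_runtime_s)        # ~0.029, matches train_steps_per_second
print(train_samples / effective_batch_size)  # ~1250 optimizer steps in one epoch
```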
trainer_state.json ADDED
@@ -0,0 +1,1972 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9993998799759952,
5
+ "eval_steps": 100,
6
+ "global_step": 1249,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 4e-08,
14
+ "logits/chosen": -2.682399272918701,
15
+ "logits/rejected": -2.7047135829925537,
16
+ "logps/chosen": -275.10638427734375,
17
+ "logps/rejected": -271.8466491699219,
18
+ "loss": 0.6931,
19
+ "rewards/accuracies": 0.0,
20
+ "rewards/chosen": 0.0,
21
+ "rewards/margins": 0.0,
22
+ "rewards/rejected": 0.0,
23
+ "step": 1
24
+ },
25
+ {
26
+ "epoch": 0.01,
27
+ "learning_rate": 4.0000000000000003e-07,
28
+ "logits/chosen": -2.606243848800659,
29
+ "logits/rejected": -2.633491277694702,
30
+ "logps/chosen": -301.7389831542969,
31
+ "logps/rejected": -324.2469787597656,
32
+ "loss": 0.6931,
33
+ "rewards/accuracies": 0.4166666567325592,
34
+ "rewards/chosen": 0.00018933300452772528,
35
+ "rewards/margins": 9.413135558133945e-06,
36
+ "rewards/rejected": 0.00017991992353927344,
37
+ "step": 10
38
+ },
39
+ {
40
+ "epoch": 0.02,
41
+ "learning_rate": 8.000000000000001e-07,
42
+ "logits/chosen": -2.5866377353668213,
43
+ "logits/rejected": -2.5900259017944336,
44
+ "logps/chosen": -269.0643615722656,
45
+ "logps/rejected": -289.1509094238281,
46
+ "loss": 0.6931,
47
+ "rewards/accuracies": 0.53125,
48
+ "rewards/chosen": -0.0011652575340121984,
49
+ "rewards/margins": 6.114605639595538e-05,
50
+ "rewards/rejected": -0.0012264035176485777,
51
+ "step": 20
52
+ },
53
+ {
54
+ "epoch": 0.02,
55
+ "learning_rate": 1.2000000000000002e-06,
56
+ "logits/chosen": -2.57534122467041,
57
+ "logits/rejected": -2.5880730152130127,
58
+ "logps/chosen": -291.5533752441406,
59
+ "logps/rejected": -311.4080505371094,
60
+ "loss": 0.693,
61
+ "rewards/accuracies": 0.46875,
62
+ "rewards/chosen": -0.0030827566515654325,
63
+ "rewards/margins": 0.0002471129409968853,
64
+ "rewards/rejected": -0.0033298698253929615,
65
+ "step": 30
66
+ },
67
+ {
68
+ "epoch": 0.03,
69
+ "learning_rate": 1.6000000000000001e-06,
70
+ "logits/chosen": -2.612988233566284,
71
+ "logits/rejected": -2.619412899017334,
72
+ "logps/chosen": -264.4569396972656,
73
+ "logps/rejected": -273.54107666015625,
74
+ "loss": 0.6927,
75
+ "rewards/accuracies": 0.5375000238418579,
76
+ "rewards/chosen": -0.005209661088883877,
77
+ "rewards/margins": 0.0008221397292800248,
78
+ "rewards/rejected": -0.006031800992786884,
79
+ "step": 40
80
+ },
81
+ {
82
+ "epoch": 0.04,
83
+ "learning_rate": 2.0000000000000003e-06,
84
+ "logits/chosen": -2.563633680343628,
85
+ "logits/rejected": -2.5437724590301514,
86
+ "logps/chosen": -264.8982849121094,
87
+ "logps/rejected": -269.68804931640625,
88
+ "loss": 0.6932,
89
+ "rewards/accuracies": 0.44999998807907104,
90
+ "rewards/chosen": -0.008133028633892536,
91
+ "rewards/margins": -1.2002186849713326e-05,
92
+ "rewards/rejected": -0.008121026679873466,
93
+ "step": 50
94
+ },
95
+ {
96
+ "epoch": 0.05,
97
+ "learning_rate": 2.4000000000000003e-06,
98
+ "logits/chosen": -2.6302762031555176,
99
+ "logits/rejected": -2.633596420288086,
100
+ "logps/chosen": -277.2352600097656,
101
+ "logps/rejected": -296.19476318359375,
102
+ "loss": 0.6921,
103
+ "rewards/accuracies": 0.550000011920929,
104
+ "rewards/chosen": -0.013791452161967754,
105
+ "rewards/margins": 0.0020633486565202475,
106
+ "rewards/rejected": -0.01585480198264122,
107
+ "step": 60
108
+ },
109
+ {
110
+ "epoch": 0.06,
111
+ "learning_rate": 2.8000000000000003e-06,
112
+ "logits/chosen": -2.6230902671813965,
113
+ "logits/rejected": -2.6135356426239014,
114
+ "logps/chosen": -280.782958984375,
115
+ "logps/rejected": -286.6590881347656,
116
+ "loss": 0.6931,
117
+ "rewards/accuracies": 0.4937500059604645,
118
+ "rewards/chosen": -0.01885022595524788,
119
+ "rewards/margins": 0.00015832395001780242,
120
+ "rewards/rejected": -0.01900855079293251,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.06,
125
+ "learning_rate": 3.2000000000000003e-06,
126
+ "logits/chosen": -2.6521055698394775,
127
+ "logits/rejected": -2.650635004043579,
128
+ "logps/chosen": -276.92156982421875,
129
+ "logps/rejected": -297.58477783203125,
130
+ "loss": 0.6924,
131
+ "rewards/accuracies": 0.5249999761581421,
132
+ "rewards/chosen": -0.017211003229022026,
133
+ "rewards/margins": 0.0015935760457068682,
134
+ "rewards/rejected": -0.018804579973220825,
135
+ "step": 80
136
+ },
137
+ {
138
+ "epoch": 0.07,
139
+ "learning_rate": 3.6000000000000003e-06,
140
+ "logits/chosen": -2.650332450866699,
141
+ "logits/rejected": -2.639291286468506,
142
+ "logps/chosen": -311.17218017578125,
143
+ "logps/rejected": -316.7694091796875,
144
+ "loss": 0.6891,
145
+ "rewards/accuracies": 0.59375,
146
+ "rewards/chosen": -0.017588406801223755,
147
+ "rewards/margins": 0.008481341414153576,
148
+ "rewards/rejected": -0.026069749146699905,
149
+ "step": 90
150
+ },
151
+ {
152
+ "epoch": 0.08,
153
+ "learning_rate": 4.000000000000001e-06,
154
+ "logits/chosen": -2.6498260498046875,
155
+ "logits/rejected": -2.6482200622558594,
156
+ "logps/chosen": -276.51043701171875,
157
+ "logps/rejected": -290.5121154785156,
158
+ "loss": 0.6909,
159
+ "rewards/accuracies": 0.550000011920929,
160
+ "rewards/chosen": -0.02430056408047676,
161
+ "rewards/margins": 0.004733243025839329,
162
+ "rewards/rejected": -0.029033806174993515,
163
+ "step": 100
164
+ },
165
+ {
166
+ "epoch": 0.08,
167
+ "eval_logits/chosen": -2.6246910095214844,
168
+ "eval_logits/rejected": -2.6219358444213867,
169
+ "eval_logps/chosen": -222.40379333496094,
170
+ "eval_logps/rejected": -228.25218200683594,
171
+ "eval_loss": 0.6920604109764099,
172
+ "eval_rewards/accuracies": 0.5426666736602783,
173
+ "eval_rewards/chosen": -0.01691427268087864,
174
+ "eval_rewards/margins": 0.0023173687513917685,
175
+ "eval_rewards/rejected": -0.01923164166510105,
176
+ "eval_runtime": 1621.9667,
177
+ "eval_samples_per_second": 1.846,
178
+ "eval_steps_per_second": 0.231,
179
+ "step": 100
180
+ },
181
+ {
182
+ "epoch": 0.09,
183
+ "learning_rate": 4.4e-06,
184
+ "logits/chosen": -2.6501355171203613,
185
+ "logits/rejected": -2.652731418609619,
186
+ "logps/chosen": -298.9729919433594,
187
+ "logps/rejected": -309.8643493652344,
188
+ "loss": 0.6891,
189
+ "rewards/accuracies": 0.6000000238418579,
190
+ "rewards/chosen": -0.025512468069791794,
191
+ "rewards/margins": 0.008652618154883385,
192
+ "rewards/rejected": -0.03416508436203003,
193
+ "step": 110
194
+ },
195
+ {
196
+ "epoch": 0.1,
197
+ "learning_rate": 4.800000000000001e-06,
198
+ "logits/chosen": -2.676758289337158,
199
+ "logits/rejected": -2.6891021728515625,
200
+ "logps/chosen": -278.479736328125,
201
+ "logps/rejected": -296.43212890625,
202
+ "loss": 0.6903,
203
+ "rewards/accuracies": 0.4937500059604645,
204
+ "rewards/chosen": -0.031918395310640335,
205
+ "rewards/margins": 0.006463131867349148,
206
+ "rewards/rejected": -0.03838152438402176,
207
+ "step": 120
208
+ },
209
+ {
210
+ "epoch": 0.1,
211
+ "learning_rate": 4.999755876225375e-06,
212
+ "logits/chosen": -2.645005702972412,
213
+ "logits/rejected": -2.62728214263916,
214
+ "logps/chosen": -294.44366455078125,
215
+ "logps/rejected": -315.0718994140625,
216
+ "loss": 0.6878,
217
+ "rewards/accuracies": 0.574999988079071,
218
+ "rewards/chosen": -0.035983841866254807,
219
+ "rewards/margins": 0.011375428177416325,
220
+ "rewards/rejected": -0.047359269112348557,
221
+ "step": 130
222
+ },
223
+ {
224
+ "epoch": 0.11,
225
+ "learning_rate": 4.997803172081864e-06,
226
+ "logits/chosen": -2.6803853511810303,
227
+ "logits/rejected": -2.680997371673584,
228
+ "logps/chosen": -289.1062927246094,
229
+ "logps/rejected": -302.7191467285156,
230
+ "loss": 0.6855,
231
+ "rewards/accuracies": 0.5625,
232
+ "rewards/chosen": -0.041079964488744736,
233
+ "rewards/margins": 0.016574053093791008,
234
+ "rewards/rejected": -0.057654011994600296,
235
+ "step": 140
236
+ },
237
+ {
238
+ "epoch": 0.12,
239
+ "learning_rate": 4.9938992891651825e-06,
240
+ "logits/chosen": -2.6616640090942383,
241
+ "logits/rejected": -2.6513876914978027,
242
+ "logps/chosen": -277.707763671875,
243
+ "logps/rejected": -300.9044494628906,
244
+ "loss": 0.683,
245
+ "rewards/accuracies": 0.6000000238418579,
246
+ "rewards/chosen": -0.05293840169906616,
247
+ "rewards/margins": 0.0216156505048275,
248
+ "rewards/rejected": -0.07455406337976456,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.13,
253
+ "learning_rate": 4.988047277024456e-06,
254
+ "logits/chosen": -2.7210304737091064,
255
+ "logits/rejected": -2.7316393852233887,
256
+ "logps/chosen": -288.5920715332031,
257
+ "logps/rejected": -304.19964599609375,
258
+ "loss": 0.6804,
259
+ "rewards/accuracies": 0.643750011920929,
260
+ "rewards/chosen": -0.07934443652629852,
261
+ "rewards/margins": 0.026776760816574097,
262
+ "rewards/rejected": -0.10612119734287262,
263
+ "step": 160
264
+ },
265
+ {
266
+ "epoch": 0.14,
267
+ "learning_rate": 4.980251707005417e-06,
268
+ "logits/chosen": -2.71783185005188,
269
+ "logits/rejected": -2.690868854522705,
270
+ "logps/chosen": -307.91888427734375,
271
+ "logps/rejected": -315.943359375,
272
+ "loss": 0.689,
273
+ "rewards/accuracies": 0.53125,
274
+ "rewards/chosen": -0.10906956344842911,
275
+ "rewards/margins": 0.009946177713572979,
276
+ "rewards/rejected": -0.11901573836803436,
277
+ "step": 170
278
+ },
279
+ {
280
+ "epoch": 0.14,
281
+ "learning_rate": 4.970518668679459e-06,
282
+ "logits/chosen": -2.729719638824463,
283
+ "logits/rejected": -2.714111804962158,
284
+ "logps/chosen": -304.322998046875,
285
+ "logps/rejected": -311.78167724609375,
286
+ "loss": 0.6816,
287
+ "rewards/accuracies": 0.574999988079071,
288
+ "rewards/chosen": -0.09336166828870773,
289
+ "rewards/margins": 0.0258515365421772,
290
+ "rewards/rejected": -0.11921320110559464,
291
+ "step": 180
292
+ },
293
+ {
294
+ "epoch": 0.15,
295
+ "learning_rate": 4.958855765086722e-06,
296
+ "logits/chosen": -2.757159471511841,
297
+ "logits/rejected": -2.7543435096740723,
298
+ "logps/chosen": -284.23687744140625,
299
+ "logps/rejected": -293.60595703125,
300
+ "loss": 0.6848,
301
+ "rewards/accuracies": 0.5562499761581421,
302
+ "rewards/chosen": -0.06431926041841507,
303
+ "rewards/margins": 0.019170444458723068,
304
+ "rewards/rejected": -0.08348970115184784,
305
+ "step": 190
306
+ },
307
+ {
308
+ "epoch": 0.16,
309
+ "learning_rate": 4.945272106796919e-06,
310
+ "logits/chosen": -2.770078420639038,
311
+ "logits/rejected": -2.7745845317840576,
312
+ "logps/chosen": -285.0936584472656,
313
+ "logps/rejected": -300.80975341796875,
314
+ "loss": 0.684,
315
+ "rewards/accuracies": 0.5874999761581421,
316
+ "rewards/chosen": -0.0751299113035202,
317
+ "rewards/margins": 0.02216259017586708,
318
+ "rewards/rejected": -0.09729250520467758,
319
+ "step": 200
320
+ },
321
+ {
322
+ "epoch": 0.16,
323
+ "eval_logits/chosen": -2.762911558151245,
324
+ "eval_logits/rejected": -2.759880542755127,
325
+ "eval_logps/chosen": -228.746826171875,
326
+ "eval_logps/rejected": -235.77212524414062,
327
+ "eval_loss": 0.6872997283935547,
328
+ "eval_rewards/accuracies": 0.5566666722297668,
329
+ "eval_rewards/chosen": -0.08034466207027435,
330
+ "eval_rewards/margins": 0.014086335897445679,
331
+ "eval_rewards/rejected": -0.09443099796772003,
332
+ "eval_runtime": 1664.3062,
333
+ "eval_samples_per_second": 1.799,
334
+ "eval_steps_per_second": 0.225,
335
+ "step": 200
336
+ },
337
+ {
338
+ "epoch": 0.17,
339
+ "learning_rate": 4.929778304792537e-06,
340
+ "logits/chosen": -2.7531464099884033,
341
+ "logits/rejected": -2.7568936347961426,
342
+ "logps/chosen": -310.95513916015625,
343
+ "logps/rejected": -315.43353271484375,
344
+ "loss": 0.6749,
345
+ "rewards/accuracies": 0.612500011920929,
346
+ "rewards/chosen": -0.08977816253900528,
347
+ "rewards/margins": 0.044371671974658966,
348
+ "rewards/rejected": -0.13414981961250305,
349
+ "step": 210
350
+ },
351
+ {
352
+ "epoch": 0.18,
353
+ "learning_rate": 4.912386462179987e-06,
354
+ "logits/chosen": -2.7818262577056885,
355
+ "logits/rejected": -2.777902126312256,
356
+ "logps/chosen": -298.80780029296875,
357
+ "logps/rejected": -325.99853515625,
358
+ "loss": 0.6741,
359
+ "rewards/accuracies": 0.606249988079071,
360
+ "rewards/chosen": -0.10158131271600723,
361
+ "rewards/margins": 0.044485487043857574,
362
+ "rewards/rejected": -0.1460667848587036,
363
+ "step": 220
364
+ },
365
+ {
366
+ "epoch": 0.18,
367
+ "learning_rate": 4.893110164735167e-06,
368
+ "logits/chosen": -2.8827590942382812,
369
+ "logits/rejected": -2.8796629905700684,
370
+ "logps/chosen": -305.14068603515625,
371
+ "logps/rejected": -319.39471435546875,
372
+ "loss": 0.6796,
373
+ "rewards/accuracies": 0.543749988079071,
374
+ "rewards/chosen": -0.1311652809381485,
375
+ "rewards/margins": 0.0319681391119957,
376
+ "rewards/rejected": -0.1631334125995636,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 0.19,
381
+ "learning_rate": 4.871964470290823e-06,
382
+ "logits/chosen": -2.90864634513855,
383
+ "logits/rejected": -2.9260551929473877,
384
+ "logps/chosen": -309.19970703125,
385
+ "logps/rejected": -332.04119873046875,
386
+ "loss": 0.6654,
387
+ "rewards/accuracies": 0.643750011920929,
388
+ "rewards/chosen": -0.1595773994922638,
389
+ "rewards/margins": 0.06538807600736618,
390
+ "rewards/rejected": -0.22496548295021057,
391
+ "step": 240
392
+ },
393
+ {
394
+ "epoch": 0.2,
395
+ "learning_rate": 4.848965896974006e-06,
396
+ "logits/chosen": -2.947906494140625,
397
+ "logits/rejected": -2.940717935562134,
398
+ "logps/chosen": -302.98651123046875,
399
+ "logps/rejected": -325.380859375,
400
+ "loss": 0.6756,
401
+ "rewards/accuracies": 0.5562499761581421,
402
+ "rewards/chosen": -0.19427716732025146,
403
+ "rewards/margins": 0.043551910668611526,
404
+ "rewards/rejected": -0.2378290891647339,
405
+ "step": 250
406
+ },
407
+ {
408
+ "epoch": 0.21,
409
+ "learning_rate": 4.8241324103028055e-06,
410
+ "logits/chosen": -3.116504430770874,
411
+ "logits/rejected": -3.088792324066162,
412
+ "logps/chosen": -312.82159423828125,
413
+ "logps/rejected": -328.6378479003906,
414
+ "loss": 0.6614,
415
+ "rewards/accuracies": 0.606249988079071,
416
+ "rewards/chosen": -0.24597156047821045,
417
+ "rewards/margins": 0.07650710642337799,
418
+ "rewards/rejected": -0.322478711605072,
419
+ "step": 260
420
+ },
421
+ {
422
+ "epoch": 0.22,
423
+ "learning_rate": 4.797483409152438e-06,
424
+ "logits/chosen": -3.2219741344451904,
425
+ "logits/rejected": -3.211695432662964,
426
+ "logps/chosen": -308.4544372558594,
427
+ "logps/rejected": -333.4085388183594,
428
+ "loss": 0.6625,
429
+ "rewards/accuracies": 0.581250011920929,
430
+ "rewards/chosen": -0.3319942355155945,
431
+ "rewards/margins": 0.08054188638925552,
432
+ "rewards/rejected": -0.412536084651947,
433
+ "step": 270
434
+ },
435
+ {
436
+ "epoch": 0.22,
437
+ "learning_rate": 4.769039710601669e-06,
438
+ "logits/chosen": -3.368110179901123,
439
+ "logits/rejected": -3.3736987113952637,
440
+ "logps/chosen": -316.3270263671875,
441
+ "logps/rejected": -338.3800048828125,
442
+ "loss": 0.6608,
443
+ "rewards/accuracies": 0.6499999761581421,
444
+ "rewards/chosen": -0.42326441407203674,
445
+ "rewards/margins": 0.08332939445972443,
446
+ "rewards/rejected": -0.5065938234329224,
447
+ "step": 280
448
+ },
449
+ {
450
+ "epoch": 0.23,
451
+ "learning_rate": 4.738823533671383e-06,
452
+ "logits/chosen": -3.503385543823242,
453
+ "logits/rejected": -3.490826368331909,
454
+ "logps/chosen": -351.4194030761719,
455
+ "logps/rejected": -368.7189025878906,
456
+ "loss": 0.6789,
457
+ "rewards/accuracies": 0.59375,
458
+ "rewards/chosen": -0.5290869474411011,
459
+ "rewards/margins": 0.048080917447805405,
460
+ "rewards/rejected": -0.5771678686141968,
461
+ "step": 290
462
+ },
463
+ {
464
+ "epoch": 0.24,
465
+ "learning_rate": 4.706858481968017e-06,
466
+ "logits/chosen": -3.464003801345825,
467
+ "logits/rejected": -3.469198226928711,
468
+ "logps/chosen": -340.75830078125,
469
+ "logps/rejected": -352.869384765625,
470
+ "loss": 0.6795,
471
+ "rewards/accuracies": 0.550000011920929,
472
+ "rewards/chosen": -0.6002467274665833,
473
+ "rewards/margins": 0.04778647795319557,
474
+ "rewards/rejected": -0.6480332612991333,
475
+ "step": 300
476
+ },
477
+ {
478
+ "epoch": 0.24,
479
+ "eval_logits/chosen": -3.5198659896850586,
480
+ "eval_logits/rejected": -3.5140511989593506,
481
+ "eval_logps/chosen": -272.0929260253906,
482
+ "eval_logps/rejected": -281.4856262207031,
483
+ "eval_loss": 0.683901846408844,
484
+ "eval_rewards/accuracies": 0.5460000038146973,
485
+ "eval_rewards/chosen": -0.5138051509857178,
486
+ "eval_rewards/margins": 0.037760715931653976,
487
+ "eval_rewards/rejected": -0.551565945148468,
488
+ "eval_runtime": 1688.0776,
489
+ "eval_samples_per_second": 1.774,
490
+ "eval_steps_per_second": 0.222,
491
+ "step": 300
492
+ },
493
+ {
494
+ "epoch": 0.25,
495
+ "learning_rate": 4.673169525245416e-06,
496
+ "logits/chosen": -3.4468257427215576,
497
+ "logits/rejected": -3.422842502593994,
498
+ "logps/chosen": -337.1869812011719,
499
+ "logps/rejected": -369.9530029296875,
500
+ "loss": 0.6606,
501
+ "rewards/accuracies": 0.6000000238418579,
502
+ "rewards/chosen": -0.5638504028320312,
503
+ "rewards/margins": 0.09270543605089188,
504
+ "rewards/rejected": -0.6565557718276978,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 0.26,
509
+ "learning_rate": 4.63778297989952e-06,
510
+ "logits/chosen": -3.598461866378784,
511
+ "logits/rejected": -3.5812110900878906,
512
+ "logps/chosen": -344.33819580078125,
513
+ "logps/rejected": -364.0965270996094,
514
+ "loss": 0.6658,
515
+ "rewards/accuracies": 0.612500011920929,
516
+ "rewards/chosen": -0.5991551280021667,
517
+ "rewards/margins": 0.08249818533658981,
518
+ "rewards/rejected": -0.6816532015800476,
519
+ "step": 320
520
+ },
521
+ {
522
+ "epoch": 0.26,
523
+ "learning_rate": 4.60072648841109e-06,
524
+ "logits/chosen": -3.7547969818115234,
525
+ "logits/rejected": -3.74609637260437,
526
+ "logps/chosen": -364.9189758300781,
527
+ "logps/rejected": -394.4504089355469,
528
+ "loss": 0.6271,
529
+ "rewards/accuracies": 0.699999988079071,
530
+ "rewards/chosen": -0.7271077632904053,
531
+ "rewards/margins": 0.17919641733169556,
532
+ "rewards/rejected": -0.9063041806221008,
533
+ "step": 330
534
+ },
535
+ {
536
+ "epoch": 0.27,
537
+ "learning_rate": 4.562028997752574e-06,
538
+ "logits/chosen": -3.9404075145721436,
539
+ "logits/rejected": -3.9303627014160156,
540
+ "logps/chosen": -380.1604919433594,
541
+ "logps/rejected": -408.8302307128906,
542
+ "loss": 0.676,
543
+ "rewards/accuracies": 0.5874999761581421,
544
+ "rewards/chosen": -0.8844378590583801,
545
+ "rewards/margins": 0.08313676714897156,
546
+ "rewards/rejected": -0.9675747156143188,
547
+ "step": 340
548
+ },
549
+ {
550
+ "epoch": 0.28,
551
+ "learning_rate": 4.521720736775947e-06,
552
+ "logits/chosen": -3.974989414215088,
553
+ "logits/rejected": -3.998753786087036,
554
+ "logps/chosen": -397.6668701171875,
555
+ "logps/rejected": -411.4002990722656,
556
+ "loss": 0.6559,
557
+ "rewards/accuracies": 0.625,
558
+ "rewards/chosen": -1.0197855234146118,
559
+ "rewards/margins": 0.12257959693670273,
560
+ "rewards/rejected": -1.1423652172088623,
561
+ "step": 350
562
+ },
563
+ {
564
+ "epoch": 0.29,
565
+ "learning_rate": 4.479833192599198e-06,
566
+ "logits/chosen": -3.942905902862549,
567
+ "logits/rejected": -3.9193332195281982,
568
+ "logps/chosen": -387.99420166015625,
569
+ "logps/rejected": -410.8706970214844,
570
+ "loss": 0.6613,
571
+ "rewards/accuracies": 0.581250011920929,
572
+ "rewards/chosen": -0.8740938305854797,
573
+ "rewards/margins": 0.10989212989807129,
574
+ "rewards/rejected": -0.9839859008789062,
575
+ "step": 360
576
+ },
577
+ {
578
+ "epoch": 0.3,
579
+ "learning_rate": 4.436399086009928e-06,
580
+ "logits/chosen": -3.781745195388794,
581
+ "logits/rejected": -3.746504306793213,
582
+ "logps/chosen": -363.17657470703125,
583
+ "logps/rejected": -384.13323974609375,
584
+ "loss": 0.6487,
585
+ "rewards/accuracies": 0.6625000238418579,
586
+ "rewards/chosen": -0.7560319304466248,
587
+ "rewards/margins": 0.12066509574651718,
588
+ "rewards/rejected": -0.8766969442367554,
589
+ "step": 370
590
+ },
591
+ {
592
+ "epoch": 0.3,
593
+ "learning_rate": 4.391452345905239e-06,
594
+ "logits/chosen": -3.672318696975708,
595
+ "logits/rejected": -3.6834559440612793,
596
+ "logps/chosen": -373.22344970703125,
597
+ "logps/rejected": -389.52777099609375,
598
+ "loss": 0.6611,
599
+ "rewards/accuracies": 0.5625,
600
+ "rewards/chosen": -0.8263392448425293,
601
+ "rewards/margins": 0.11394073814153671,
602
+ "rewards/rejected": -0.9402799606323242,
603
+ "step": 380
604
+ },
605
+ {
606
+ "epoch": 0.31,
607
+ "learning_rate": 4.3450280827879125e-06,
608
+ "logits/chosen": -3.7310726642608643,
609
+ "logits/rejected": -3.7564334869384766,
610
+ "logps/chosen": -374.36260986328125,
611
+ "logps/rejected": -394.80010986328125,
612
+ "loss": 0.6607,
613
+ "rewards/accuracies": 0.6187499761581421,
614
+ "rewards/chosen": -0.8552727699279785,
615
+ "rewards/margins": 0.11295287311077118,
616
+ "rewards/rejected": -0.9682257771492004,
617
+ "step": 390
618
+ },
619
+ {
620
+ "epoch": 0.32,
621
+ "learning_rate": 4.297162561339554e-06,
622
+ "logits/chosen": -3.6382896900177,
623
+ "logits/rejected": -3.6042380332946777,
624
+ "logps/chosen": -380.48590087890625,
625
+ "logps/rejected": -407.5166015625,
626
+ "loss": 0.6561,
627
+ "rewards/accuracies": 0.6187499761581421,
628
+ "rewards/chosen": -0.8749423027038574,
629
+ "rewards/margins": 0.1411461979150772,
630
+ "rewards/rejected": -1.016088604927063,
631
+ "step": 400
632
+ },
633
+ {
634
+ "epoch": 0.32,
635
+ "eval_logits/chosen": -3.7580296993255615,
636
+ "eval_logits/rejected": -3.7484097480773926,
637
+ "eval_logps/chosen": -302.29541015625,
638
+ "eval_logps/rejected": -314.21051025390625,
639
+ "eval_loss": 0.6812021136283875,
640
+ "eval_rewards/accuracies": 0.5573333501815796,
641
+ "eval_rewards/chosen": -0.815830409526825,
642
+ "eval_rewards/margins": 0.06298430263996124,
643
+ "eval_rewards/rejected": -0.8788146376609802,
644
+ "eval_runtime": 1618.6439,
645
+ "eval_samples_per_second": 1.85,
646
+ "eval_steps_per_second": 0.232,
647
+ "step": 400
648
+ },
649
+ {
650
+ "epoch": 0.33,
651
+ "learning_rate": 4.247893172092157e-06,
652
+ "logits/chosen": -3.615405559539795,
653
+ "logits/rejected": -3.612015962600708,
654
+ "logps/chosen": -370.3289794921875,
655
+ "logps/rejected": -405.1226806640625,
656
+ "loss": 0.6581,
657
+ "rewards/accuracies": 0.6312500238418579,
658
+ "rewards/chosen": -0.9003459215164185,
659
+ "rewards/margins": 0.13107234239578247,
660
+ "rewards/rejected": -1.0314182043075562,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 0.34,
665
+ "learning_rate": 4.197258402220187e-06,
666
+ "logits/chosen": -3.65478515625,
667
+ "logits/rejected": -3.6666762828826904,
668
+ "logps/chosen": -379.541748046875,
669
+ "logps/rejected": -418.70880126953125,
670
+ "loss": 0.657,
671
+ "rewards/accuracies": 0.6000000238418579,
672
+ "rewards/chosen": -0.9110058546066284,
673
+ "rewards/margins": 0.3442539870738983,
674
+ "rewards/rejected": -1.2552598714828491,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 0.34,
679
+ "learning_rate": 4.145297805476023e-06,
680
+ "logits/chosen": -3.6817328929901123,
681
+ "logits/rejected": -3.688814640045166,
682
+ "logps/chosen": -376.09783935546875,
683
+ "logps/rejected": -406.4654846191406,
684
+ "loss": 0.6439,
685
+ "rewards/accuracies": 0.637499988079071,
686
+ "rewards/chosen": -0.8991987109184265,
687
+ "rewards/margins": 0.1862173080444336,
688
+ "rewards/rejected": -1.0854160785675049,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 0.35,
693
+ "learning_rate": 4.092051971292228e-06,
694
+ "logits/chosen": -3.716754198074341,
695
+ "logits/rejected": -3.7106194496154785,
696
+ "logps/chosen": -376.1932067871094,
697
+ "logps/rejected": -401.6561584472656,
698
+ "loss": 0.6554,
699
+ "rewards/accuracies": 0.643750011920929,
700
+ "rewards/chosen": -0.954433798789978,
701
+ "rewards/margins": 0.13505356013774872,
702
+ "rewards/rejected": -1.0894873142242432,
703
+ "step": 440
704
+ },
705
+ {
706
+ "epoch": 0.36,
707
+ "learning_rate": 4.037562493074792e-06,
708
+ "logits/chosen": -3.8129425048828125,
709
+ "logits/rejected": -3.833683729171753,
710
+ "logps/chosen": -403.4329528808594,
711
+ "logps/rejected": -423.4547424316406,
712
+ "loss": 0.6365,
713
+ "rewards/accuracies": 0.6812499761581421,
714
+ "rewards/chosen": -1.0463995933532715,
715
+ "rewards/margins": 0.19165393710136414,
716
+ "rewards/rejected": -1.2380534410476685,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 0.37,
721
+ "learning_rate": 3.981871935712112e-06,
722
+ "logits/chosen": -3.983973741531372,
723
+ "logits/rejected": -3.940070629119873,
724
+ "logps/chosen": -385.3892517089844,
725
+ "logps/rejected": -416.1813049316406,
726
+ "loss": 0.6533,
727
+ "rewards/accuracies": 0.5687500238418579,
728
+ "rewards/chosen": -1.0447807312011719,
729
+ "rewards/margins": 0.14543746411800385,
730
+ "rewards/rejected": -1.190218210220337,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 0.38,
735
+ "learning_rate": 3.925023802325094e-06,
736
+ "logits/chosen": -4.042995929718018,
737
+ "logits/rejected": -4.0136399269104,
738
+ "logps/chosen": -401.97381591796875,
739
+ "logps/rejected": -437.3184509277344,
740
+ "loss": 0.6455,
741
+ "rewards/accuracies": 0.6187499761581421,
742
+ "rewards/chosen": -1.0883817672729492,
743
+ "rewards/margins": 0.2549799978733063,
744
+ "rewards/rejected": -1.343361735343933,
745
+ "step": 470
746
+ },
747
+ {
748
+ "epoch": 0.38,
749
+ "learning_rate": 3.867062500284342e-06,
750
+ "logits/chosen": -4.073556423187256,
751
+ "logits/rejected": -4.043025493621826,
752
+ "logps/chosen": -388.99090576171875,
753
+ "logps/rejected": -425.56787109375,
754
+ "loss": 0.6433,
755
+ "rewards/accuracies": 0.612500011920929,
756
+ "rewards/chosen": -1.1277819871902466,
757
+ "rewards/margins": 0.1785212755203247,
758
+ "rewards/rejected": -1.3063032627105713,
759
+ "step": 480
760
+ },
761
+ {
762
+ "epoch": 0.39,
763
+ "learning_rate": 3.8080333065209885e-06,
764
+ "logits/chosen": -4.076624393463135,
765
+ "logits/rejected": -4.084932327270508,
766
+ "logps/chosen": -391.6429138183594,
767
+ "logps/rejected": -392.52362060546875,
768
+ "loss": 0.7064,
769
+ "rewards/accuracies": 0.53125,
770
+ "rewards/chosen": -1.1167463064193726,
771
+ "rewards/margins": 0.06357622146606445,
772
+ "rewards/rejected": -1.180322527885437,
773
+ "step": 490
774
+ },
775
+ {
776
+ "epoch": 0.4,
777
+ "learning_rate": 3.7479823321582624e-06,
778
+ "logits/chosen": -3.93993878364563,
779
+ "logits/rejected": -3.9026870727539062,
780
+ "logps/chosen": -378.84918212890625,
781
+ "logps/rejected": -416.9828186035156,
782
+ "loss": 0.633,
783
+ "rewards/accuracies": 0.6187499761581421,
784
+ "rewards/chosen": -0.9214572906494141,
785
+ "rewards/margins": 0.19020086526870728,
786
+ "rewards/rejected": -1.1116580963134766,
787
+ "step": 500
788
+ },
789
+ {
790
+ "epoch": 0.4,
791
+ "eval_logits/chosen": -4.107741355895996,
792
+ "eval_logits/rejected": -4.097755432128906,
793
+ "eval_logps/chosen": -310.98577880859375,
794
+ "eval_logps/rejected": -324.42694091796875,
795
+ "eval_loss": 0.678744912147522,
796
+ "eval_rewards/accuracies": 0.5596666932106018,
797
+ "eval_rewards/chosen": -0.9027342796325684,
798
+ "eval_rewards/margins": 0.0782446414232254,
799
+ "eval_rewards/rejected": -0.9809789061546326,
800
+ "eval_runtime": 1619.4632,
801
+ "eval_samples_per_second": 1.849,
802
+ "eval_steps_per_second": 0.232,
803
+ "step": 500
804
+ },
805
+ {
806
+ "epoch": 0.41,
807
+ "learning_rate": 3.686956486491419e-06,
808
+ "logits/chosen": -3.9462807178497314,
809
+ "logits/rejected": -3.941249132156372,
810
+ "logps/chosen": -386.48590087890625,
811
+ "logps/rejected": -424.64508056640625,
812
+ "loss": 0.6313,
813
+ "rewards/accuracies": 0.637499988079071,
814
+ "rewards/chosen": -0.9227803349494934,
815
+ "rewards/margins": 0.29515519738197327,
816
+ "rewards/rejected": -1.217935562133789,
817
+ "step": 510
818
+ },
819
+ {
820
+ "epoch": 0.42,
821
+ "learning_rate": 3.625003440344166e-06,
822
+ "logits/chosen": -4.044493198394775,
823
+ "logits/rejected": -4.073317527770996,
824
+ "logps/chosen": -369.5896301269531,
825
+ "logps/rejected": -382.105224609375,
826
+ "loss": 0.664,
827
+ "rewards/accuracies": 0.6000000238418579,
828
+ "rewards/chosen": -0.9153301119804382,
829
+ "rewards/margins": 0.08651997148990631,
830
+ "rewards/rejected": -1.0018501281738281,
831
+ "step": 520
832
+ },
833
+ {
834
+ "epoch": 0.42,
835
+ "learning_rate": 3.562171588830231e-06,
836
+ "logits/chosen": -4.016448497772217,
837
+ "logits/rejected": -3.9958884716033936,
838
+ "logps/chosen": -377.91912841796875,
839
+ "logps/rejected": -404.9534606933594,
840
+ "loss": 0.6943,
841
+ "rewards/accuracies": 0.5625,
842
+ "rewards/chosen": -0.8825477361679077,
843
+ "rewards/margins": 0.0769728347659111,
844
+ "rewards/rejected": -0.9595205187797546,
845
+ "step": 530
846
+ },
847
+ {
848
+ "epoch": 0.43,
849
+ "learning_rate": 3.4985100135491245e-06,
850
+ "logits/chosen": -4.008540630340576,
851
+ "logits/rejected": -3.9679737091064453,
852
+ "logps/chosen": -382.65924072265625,
853
+ "logps/rejected": -425.71954345703125,
854
+ "loss": 0.629,
855
+ "rewards/accuracies": 0.6499999761581421,
856
+ "rewards/chosen": -0.8818261027336121,
857
+ "rewards/margins": 0.2214728146791458,
858
+ "rewards/rejected": -1.1032989025115967,
859
+ "step": 540
860
+ },
861
+ {
862
+ "epoch": 0.44,
863
+ "learning_rate": 3.4340684442456673e-06,
864
+ "logits/chosen": -4.049837589263916,
865
+ "logits/rejected": -4.043179035186768,
866
+ "logps/chosen": -384.42938232421875,
867
+ "logps/rejected": -410.5345153808594,
868
+ "loss": 0.6434,
869
+ "rewards/accuracies": 0.6187499761581421,
870
+ "rewards/chosen": -0.9648186564445496,
871
+ "rewards/margins": 0.1496874988079071,
872
+ "rewards/rejected": -1.1145063638687134,
873
+ "step": 550
874
+ },
875
+ {
876
+ "epoch": 0.45,
877
+ "learning_rate": 3.3688972199631974e-06,
878
+ "logits/chosen": -4.042217254638672,
879
+ "logits/rejected": -4.064842224121094,
880
+ "logps/chosen": -387.6910705566406,
881
+ "logps/rejected": -413.78662109375,
882
+ "loss": 0.6329,
883
+ "rewards/accuracies": 0.6812499761581421,
884
+ "rewards/chosen": -0.9935464859008789,
885
+ "rewards/margins": 0.24879872798919678,
886
+ "rewards/rejected": -1.2423454523086548,
887
+ "step": 560
888
+ },
889
+ {
890
+ "epoch": 0.46,
891
+ "learning_rate": 3.3030472497208354e-06,
892
+ "logits/chosen": -4.0646257400512695,
893
+ "logits/rejected": -4.018919944763184,
894
+ "logps/chosen": -385.509765625,
895
+ "logps/rejected": -450.21429443359375,
896
+ "loss": 0.6219,
897
+ "rewards/accuracies": 0.625,
898
+ "rewards/chosen": -1.0814400911331177,
899
+ "rewards/margins": 0.26284489035606384,
900
+ "rewards/rejected": -1.3442847728729248,
901
+ "step": 570
902
+ },
903
+ {
904
+ "epoch": 0.46,
905
+ "learning_rate": 3.236569972745492e-06,
906
+ "logits/chosen": -4.145129203796387,
907
+ "logits/rejected": -4.1337690353393555,
908
+ "logps/chosen": -380.5148620605469,
909
+ "logps/rejected": -406.1288146972656,
910
+ "loss": 0.6628,
911
+ "rewards/accuracies": 0.65625,
912
+ "rewards/chosen": -1.0649586915969849,
913
+ "rewards/margins": 0.15909543633460999,
914
+ "rewards/rejected": -1.2240540981292725,
915
+ "step": 580
916
+ },
917
+ {
918
+ "epoch": 0.47,
919
+ "learning_rate": 3.1695173182897126e-06,
920
+ "logits/chosen": -4.133418083190918,
921
+ "logits/rejected": -4.11216402053833,
922
+ "logps/chosen": -398.23626708984375,
923
+ "logps/rejected": -435.56646728515625,
924
+ "loss": 0.6554,
925
+ "rewards/accuracies": 0.6312500238418579,
926
+ "rewards/chosen": -1.1820625066757202,
927
+ "rewards/margins": 0.16709741950035095,
928
+ "rewards/rejected": -1.349160075187683,
929
+ "step": 590
930
+ },
931
+ {
932
+ "epoch": 0.48,
933
+ "learning_rate": 3.10194166506673e-06,
934
+ "logits/chosen": -4.175902366638184,
935
+ "logits/rejected": -4.1359357833862305,
936
+ "logps/chosen": -390.8631286621094,
937
+ "logps/rejected": -439.28436279296875,
938
+ "loss": 0.6302,
939
+ "rewards/accuracies": 0.6312500238418579,
940
+ "rewards/chosen": -1.061226487159729,
941
+ "rewards/margins": 0.2546694874763489,
942
+ "rewards/rejected": -1.3158957958221436,
943
+ "step": 600
944
+ },
945
+ {
946
+ "epoch": 0.48,
947
+ "eval_logits/chosen": -4.443523406982422,
948
+ "eval_logits/rejected": -4.431849002838135,
949
+ "eval_logps/chosen": -337.6354675292969,
950
+ "eval_logps/rejected": -353.24932861328125,
951
+ "eval_loss": 0.6784851551055908,
952
+ "eval_rewards/accuracies": 0.5596666932106018,
953
+ "eval_rewards/chosen": -1.1692306995391846,
954
+ "eval_rewards/margins": 0.09997232258319855,
955
+ "eval_rewards/rejected": -1.2692030668258667,
956
+ "eval_runtime": 1619.3984,
957
+ "eval_samples_per_second": 1.849,
958
+ "eval_steps_per_second": 0.232,
959
+ "step": 600
960
+ },
961
+ {
962
+ "epoch": 0.49,
963
+ "learning_rate": 3.0338958003344115e-06,
964
+ "logits/chosen": -4.3245649337768555,
965
+ "logits/rejected": -4.272718906402588,
966
+ "logps/chosen": -396.521240234375,
967
+ "logps/rejected": -432.1895446777344,
968
+ "loss": 0.6883,
969
+ "rewards/accuracies": 0.675000011920929,
970
+ "rewards/chosen": -1.2242915630340576,
971
+ "rewards/margins": 0.24001073837280273,
972
+ "rewards/rejected": -1.4643023014068604,
973
+ "step": 610
974
+ },
975
+ {
976
+ "epoch": 0.5,
977
+ "learning_rate": 2.9654328786600823e-06,
978
+ "logits/chosen": -4.306203365325928,
979
+ "logits/rejected": -4.252989768981934,
980
+ "logps/chosen": -397.3540344238281,
981
+ "logps/rejected": -442.0118103027344,
982
+ "loss": 0.6197,
983
+ "rewards/accuracies": 0.6937500238418579,
984
+ "rewards/chosen": -1.2108550071716309,
985
+ "rewards/margins": 0.21409356594085693,
986
+ "rewards/rejected": -1.4249485731124878,
987
+ "step": 620
988
+ },
989
+ {
990
+ "epoch": 0.5,
991
+ "learning_rate": 2.896606380398402e-06,
992
+ "logits/chosen": -4.365767478942871,
993
+ "logits/rejected": -4.406495094299316,
994
+ "logps/chosen": -417.7538146972656,
995
+ "logps/rejected": -443.05548095703125,
996
+ "loss": 0.6623,
997
+ "rewards/accuracies": 0.643750011920929,
998
+ "rewards/chosen": -1.3476839065551758,
999
+ "rewards/margins": 0.18296115100383759,
1000
+ "rewards/rejected": -1.5306451320648193,
1001
+ "step": 630
1002
+ },
1003
+ {
1004
+ "epoch": 0.51,
1005
+ "learning_rate": 2.827470069914772e-06,
1006
+ "logits/chosen": -4.2744035720825195,
1007
+ "logits/rejected": -4.236593723297119,
1008
+ "logps/chosen": -425.37908935546875,
1009
+ "logps/rejected": -453.02862548828125,
1010
+ "loss": 0.6756,
1011
+ "rewards/accuracies": 0.574999988079071,
1012
+ "rewards/chosen": -1.3940837383270264,
1013
+ "rewards/margins": 0.14192768931388855,
1014
+ "rewards/rejected": -1.5360115766525269,
1015
+ "step": 640
1016
+ },
1017
+ {
1018
+ "epoch": 0.52,
1019
+ "learning_rate": 2.7580779535868675e-06,
1020
+ "logits/chosen": -4.252664089202881,
1021
+ "logits/rejected": -4.254392147064209,
1022
+ "logps/chosen": -409.19378662109375,
1023
+ "logps/rejected": -438.70556640625,
1024
+ "loss": 0.6575,
1025
+ "rewards/accuracies": 0.675000011920929,
1026
+ "rewards/chosen": -1.230445384979248,
1027
+ "rewards/margins": 0.18368306756019592,
1028
+ "rewards/rejected": -1.4141284227371216,
1029
+ "step": 650
1030
+ },
1031
+ {
1032
+ "epoch": 0.53,
1033
+ "learning_rate": 2.688484237617129e-06,
1034
+ "logits/chosen": -4.151357650756836,
1035
+ "logits/rejected": -4.122767925262451,
1036
+ "logps/chosen": -400.3338928222656,
1037
+ "logps/rejected": -436.16571044921875,
1038
+ "loss": 0.6222,
1039
+ "rewards/accuracies": 0.65625,
1040
+ "rewards/chosen": -1.108682632446289,
1041
+ "rewards/margins": 0.24583642184734344,
1042
+ "rewards/rejected": -1.3545191287994385,
1043
+ "step": 660
1044
+ },
1045
+ {
1046
+ "epoch": 0.54,
1047
+ "learning_rate": 2.6187432856891585e-06,
1048
+ "logits/chosen": -4.1051225662231445,
1049
+ "logits/rejected": -4.0978288650512695,
1050
+ "logps/chosen": -407.8655700683594,
1051
+ "logps/rejected": -446.1006774902344,
1052
+ "loss": 0.6432,
1053
+ "rewards/accuracies": 0.574999988079071,
1054
+ "rewards/chosen": -1.1575034856796265,
1055
+ "rewards/margins": 0.18804897367954254,
1056
+ "rewards/rejected": -1.3455523252487183,
1057
+ "step": 670
1058
+ },
1059
+ {
1060
+ "epoch": 0.54,
1061
+ "learning_rate": 2.548909576501096e-06,
1062
+ "logits/chosen": -4.150703430175781,
1063
+ "logits/rejected": -4.143389701843262,
1064
+ "logps/chosen": -413.3582458496094,
1065
+ "logps/rejected": -443.1758728027344,
1066
+ "loss": 0.6353,
1067
+ "rewards/accuracies": 0.581250011920929,
1068
+ "rewards/chosen": -1.2182948589324951,
1069
+ "rewards/margins": 0.19429844617843628,
1070
+ "rewards/rejected": -1.4125932455062866,
1071
+ "step": 680
1072
+ },
1073
+ {
1074
+ "epoch": 0.55,
1075
+ "learning_rate": 2.4790376612091503e-06,
1076
+ "logits/chosen": -4.271695613861084,
1077
+ "logits/rejected": -4.229399681091309,
1078
+ "logps/chosen": -443.7752990722656,
1079
+ "logps/rejected": -475.551025390625,
1080
+ "loss": 0.6236,
1081
+ "rewards/accuracies": 0.6812499761581421,
1082
+ "rewards/chosen": -1.456383228302002,
1083
+ "rewards/margins": 0.23586714267730713,
1084
+ "rewards/rejected": -1.6922504901885986,
1085
+ "step": 690
1086
+ },
1087
+ {
1088
+ "epoch": 0.56,
1089
+ "learning_rate": 2.40918212081453e-06,
1090
+ "logits/chosen": -4.358768939971924,
1091
+ "logits/rejected": -4.3066534996032715,
1092
+ "logps/chosen": -407.3147888183594,
1093
+ "logps/rejected": -466.919921875,
1094
+ "loss": 0.5743,
1095
+ "rewards/accuracies": 0.6937500238418579,
1096
+ "rewards/chosen": -1.3695669174194336,
1097
+ "rewards/margins": 0.4060749411582947,
1098
+ "rewards/rejected": -1.7756417989730835,
1099
+ "step": 700
1100
+ },
1101
+ {
1102
+ "epoch": 0.56,
1103
+ "eval_logits/chosen": -4.518208026885986,
1104
+ "eval_logits/rejected": -4.504719257354736,
1105
+ "eval_logps/chosen": -375.0574645996094,
1106
+ "eval_logps/rejected": -392.7273254394531,
1107
+ "eval_loss": 0.6835331916809082,
1108
+ "eval_rewards/accuracies": 0.5630000233650208,
1109
+ "eval_rewards/chosen": -1.5434508323669434,
1110
+ "eval_rewards/margins": 0.12053229659795761,
1111
+ "eval_rewards/rejected": -1.6639831066131592,
1112
+ "eval_runtime": 1619.3262,
1113
+ "eval_samples_per_second": 1.849,
1114
+ "eval_steps_per_second": 0.232,
1115
+ "step": 700
1116
+ },
1117
+ {
1118
+ "epoch": 0.57,
1119
+ "learning_rate": 2.3393975235270654e-06,
1120
+ "logits/chosen": -4.303341865539551,
1121
+ "logits/rejected": -4.286491394042969,
1122
+ "logps/chosen": -452.05718994140625,
1123
+ "logps/rejected": -493.17144775390625,
1124
+ "loss": 0.6602,
1125
+ "rewards/accuracies": 0.5874999761581421,
1126
+ "rewards/chosen": -1.5600488185882568,
1127
+ "rewards/margins": 0.23919770121574402,
1128
+ "rewards/rejected": -1.7992465496063232,
1129
+ "step": 710
1130
+ },
1131
+ {
1132
+ "epoch": 0.58,
1133
+ "learning_rate": 2.2697383821388153e-06,
1134
+ "logits/chosen": -4.293368816375732,
1135
+ "logits/rejected": -4.3109025955200195,
1136
+ "logps/chosen": -435.1492614746094,
1137
+ "logps/rejected": -460.28033447265625,
1138
+ "loss": 0.6504,
1139
+ "rewards/accuracies": 0.5874999761581421,
1140
+ "rewards/chosen": -1.4816340208053589,
1141
+ "rewards/margins": 0.2186344861984253,
1142
+ "rewards/rejected": -1.7002685070037842,
1143
+ "step": 720
1144
+ },
1145
+ {
1146
+ "epoch": 0.58,
1147
+ "learning_rate": 2.2002591114409657e-06,
1148
+ "logits/chosen": -4.212637424468994,
1149
+ "logits/rejected": -4.208783149719238,
1150
+ "logps/chosen": -432.06805419921875,
1151
+ "logps/rejected": -468.85601806640625,
1152
+ "loss": 0.6325,
1153
+ "rewards/accuracies": 0.637499988079071,
1154
+ "rewards/chosen": -1.403395652770996,
1155
+ "rewards/margins": 0.24927303194999695,
1156
+ "rewards/rejected": -1.6526685953140259,
1157
+ "step": 730
1158
+ },
1159
+ {
1160
+ "epoch": 0.59,
1161
+ "learning_rate": 2.131013985717285e-06,
1162
+ "logits/chosen": -4.271391868591309,
1163
+ "logits/rejected": -4.220091819763184,
1164
+ "logps/chosen": -442.1729431152344,
1165
+ "logps/rejected": -491.44384765625,
1166
+ "loss": 0.6362,
1167
+ "rewards/accuracies": 0.675000011920929,
1168
+ "rewards/chosen": -1.4394041299819946,
1169
+ "rewards/margins": 0.2614460587501526,
1170
+ "rewards/rejected": -1.7008501291275024,
1171
+ "step": 740
1172
+ },
1173
+ {
1174
+ "epoch": 0.6,
1175
+ "learning_rate": 2.062057096347338e-06,
1176
+ "logits/chosen": -4.25800895690918,
1177
+ "logits/rejected": -4.223499774932861,
1178
+ "logps/chosen": -419.89495849609375,
1179
+ "logps/rejected": -435.30389404296875,
1180
+ "loss": 0.6593,
1181
+ "rewards/accuracies": 0.643750011920929,
1182
+ "rewards/chosen": -1.2841438055038452,
1183
+ "rewards/margins": 0.15097984671592712,
1184
+ "rewards/rejected": -1.4351234436035156,
1185
+ "step": 750
1186
+ },
1187
+ {
1188
+ "epoch": 0.61,
1189
+ "learning_rate": 1.9934423095525733e-06,
1190
+ "logits/chosen": -4.121432304382324,
1191
+ "logits/rejected": -4.1321306228637695,
1192
+ "logps/chosen": -416.6729431152344,
1193
+ "logps/rejected": -442.0414123535156,
1194
+ "loss": 0.6534,
1195
+ "rewards/accuracies": 0.6000000238418579,
1196
+ "rewards/chosen": -1.1921896934509277,
1197
+ "rewards/margins": 0.2546849846839905,
1198
+ "rewards/rejected": -1.4468748569488525,
1199
+ "step": 760
1200
+ },
1201
+ {
1202
+ "epoch": 0.62,
1203
+ "learning_rate": 1.9252232243182986e-06,
1204
+ "logits/chosen": -4.221813678741455,
1205
+ "logits/rejected": -4.169572830200195,
1206
+ "logps/chosen": -377.6047668457031,
1207
+ "logps/rejected": -425.6900939941406,
1208
+ "loss": 0.6026,
1209
+ "rewards/accuracies": 0.6875,
1210
+ "rewards/chosen": -1.071094036102295,
1211
+ "rewards/margins": 0.307754784822464,
1212
+ "rewards/rejected": -1.3788487911224365,
1213
+ "step": 770
1214
+ },
1215
+ {
1216
+ "epoch": 0.62,
1217
+ "learning_rate": 1.8574531305244043e-06,
1218
+ "logits/chosen": -4.010577201843262,
1219
+ "logits/rejected": -3.9968719482421875,
1220
+ "logps/chosen": -418.9974060058594,
1221
+ "logps/rejected": -465.41650390625,
1222
+ "loss": 0.5956,
1223
+ "rewards/accuracies": 0.6875,
1224
+ "rewards/chosen": -1.2276278734207153,
1225
+ "rewards/margins": 0.3166094124317169,
1226
+ "rewards/rejected": -1.5442373752593994,
1227
+ "step": 780
1228
+ },
1229
+ {
1230
+ "epoch": 0.63,
1231
+ "learning_rate": 1.7901849673175559e-06,
1232
+ "logits/chosen": -4.077489376068115,
1233
+ "logits/rejected": -4.033568382263184,
1234
+ "logps/chosen": -420.52984619140625,
1235
+ "logps/rejected": -455.3899841308594,
1236
+ "loss": 0.6434,
1237
+ "rewards/accuracies": 0.6187499761581421,
1238
+ "rewards/chosen": -1.2396165132522583,
1239
+ "rewards/margins": 0.20500020682811737,
1240
+ "rewards/rejected": -1.4446165561676025,
1241
+ "step": 790
1242
+ },
1243
+ {
1244
+ "epoch": 0.64,
1245
+ "learning_rate": 1.7234712817573555e-06,
1246
+ "logits/chosen": -4.063477516174316,
1247
+ "logits/rejected": -4.062304496765137,
1248
+ "logps/chosen": -456.8814392089844,
1249
+ "logps/rejected": -480.8173828125,
1250
+ "loss": 0.6443,
1251
+ "rewards/accuracies": 0.65625,
1252
+ "rewards/chosen": -1.4422929286956787,
1253
+ "rewards/margins": 0.22365431487560272,
1254
+ "rewards/rejected": -1.6659473180770874,
1255
+ "step": 800
1256
+ },
1257
+ {
1258
+ "epoch": 0.64,
1259
+ "eval_logits/chosen": -4.257167816162109,
1260
+ "eval_logits/rejected": -4.245348930358887,
1261
+ "eval_logps/chosen": -359.3107604980469,
1262
+ "eval_logps/rejected": -377.020751953125,
1263
+ "eval_loss": 0.6778839230537415,
1264
+ "eval_rewards/accuracies": 0.5666666626930237,
1265
+ "eval_rewards/chosen": -1.385983943939209,
1266
+ "eval_rewards/margins": 0.12093351036310196,
1267
+ "eval_rewards/rejected": -1.5069174766540527,
1268
+ "eval_runtime": 1620.4754,
1269
+ "eval_samples_per_second": 1.848,
1270
+ "eval_steps_per_second": 0.231,
1271
+ "step": 800
1272
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 1.6573641877687936e-06,
+ "logits/chosen": -4.076521873474121,
+ "logits/rejected": -4.057218074798584,
+ "logps/chosen": -422.85699462890625,
+ "logps/rejected": -470.95465087890625,
+ "loss": 0.6161,
+ "rewards/accuracies": 0.6499999761581421,
+ "rewards/chosen": -1.3746501207351685,
+ "rewards/margins": 0.2610599100589752,
+ "rewards/rejected": -1.6357100009918213,
+ "step": 810
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 1.591915325433034e-06,
+ "logits/chosen": -4.133788108825684,
+ "logits/rejected": -4.142486572265625,
+ "logps/chosen": -414.0416564941406,
+ "logps/rejected": -449.82373046875,
+ "loss": 0.6194,
+ "rewards/accuracies": 0.6499999761581421,
+ "rewards/chosen": -1.315294861793518,
+ "rewards/margins": 0.30571961402893066,
+ "rewards/rejected": -1.6210145950317383,
+ "step": 820
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 1.5271758206483664e-06,
+ "logits/chosen": -4.143270015716553,
+ "logits/rejected": -4.132315158843994,
+ "logps/chosen": -438.13037109375,
+ "logps/rejected": -471.50030517578125,
+ "loss": 0.6481,
+ "rewards/accuracies": 0.6937500238418579,
+ "rewards/chosen": -1.4561512470245361,
+ "rewards/margins": 0.22672787308692932,
+ "rewards/rejected": -1.682879090309143,
+ "step": 830
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 1.4631962451927966e-06,
+ "logits/chosen": -4.0487775802612305,
+ "logits/rejected": -4.032698631286621,
+ "logps/chosen": -431.3470153808594,
+ "logps/rejected": -473.2076110839844,
+ "loss": 0.6076,
+ "rewards/accuracies": 0.6812499761581421,
+ "rewards/chosen": -1.3846697807312012,
+ "rewards/margins": 0.27571359276771545,
+ "rewards/rejected": -1.6603834629058838,
+ "step": 840
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 1.4000265772195032e-06,
+ "logits/chosen": -4.225982666015625,
+ "logits/rejected": -4.171608924865723,
+ "logps/chosen": -430.31201171875,
+ "logps/rejected": -475.3255920410156,
+ "loss": 0.6196,
+ "rewards/accuracies": 0.6625000238418579,
+ "rewards/chosen": -1.3736594915390015,
+ "rewards/margins": 0.2828761339187622,
+ "rewards/rejected": -1.6565355062484741,
+ "step": 850
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 1.3377161622160077e-06,
+ "logits/chosen": -4.169137001037598,
+ "logits/rejected": -4.160987854003906,
+ "logps/chosen": -430.1625061035156,
+ "logps/rejected": -470.7685546875,
+ "loss": 0.6046,
+ "rewards/accuracies": 0.7250000238418579,
+ "rewards/chosen": -1.3943125009536743,
+ "rewards/margins": 0.2797546982765198,
+ "rewards/rejected": -1.6740672588348389,
+ "step": 860
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 1.276313674457553e-06,
+ "logits/chosen": -4.306554794311523,
+ "logits/rejected": -4.296151161193848,
+ "logps/chosen": -415.35101318359375,
+ "logps/rejected": -470.18145751953125,
+ "loss": 0.5782,
+ "rewards/accuracies": 0.71875,
+ "rewards/chosen": -1.413987636566162,
+ "rewards/margins": 0.35374554991722107,
+ "rewards/rejected": -1.7677332162857056,
+ "step": 870
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 1.2158670789848095e-06,
+ "logits/chosen": -4.3886284828186035,
+ "logits/rejected": -4.385241508483887,
+ "logps/chosen": -460.49005126953125,
+ "logps/rejected": -502.52130126953125,
+ "loss": 0.608,
+ "rewards/accuracies": 0.675000011920929,
+ "rewards/chosen": -1.6725581884384155,
+ "rewards/margins": 0.34351325035095215,
+ "rewards/rejected": -2.016071319580078,
+ "step": 880
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 1.1564235941356016e-06,
+ "logits/chosen": -4.544154167175293,
+ "logits/rejected": -4.480313301086426,
+ "logps/chosen": -452.08245849609375,
+ "logps/rejected": -509.763427734375,
+ "loss": 0.6428,
+ "rewards/accuracies": 0.5874999761581421,
+ "rewards/chosen": -1.7576968669891357,
+ "rewards/margins": 0.3635411858558655,
+ "rewards/rejected": -2.1212382316589355,
+ "step": 890
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 1.0980296546599254e-06,
+ "logits/chosen": -4.359221935272217,
+ "logits/rejected": -4.357415199279785,
+ "logps/chosen": -476.0204162597656,
+ "logps/rejected": -513.69287109375,
+ "loss": 0.6651,
+ "rewards/accuracies": 0.6000000238418579,
+ "rewards/chosen": -1.769547462463379,
+ "rewards/margins": 0.4242987036705017,
+ "rewards/rejected": -2.1938462257385254,
+ "step": 900
+ },
+ {
+ "epoch": 0.72,
+ "eval_logits/chosen": -4.617808818817139,
+ "eval_logits/rejected": -4.603901386260986,
+ "eval_logps/chosen": -387.0414123535156,
+ "eval_logps/rejected": -406.733154296875,
+ "eval_loss": 0.6818779706954956,
+ "eval_rewards/accuracies": 0.5693333148956299,
+ "eval_rewards/chosen": -1.6632905006408691,
+ "eval_rewards/margins": 0.14075076580047607,
+ "eval_rewards/rejected": -1.8040413856506348,
+ "eval_runtime": 1619.8038,
+ "eval_samples_per_second": 1.848,
+ "eval_steps_per_second": 0.232,
+ "step": 900
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 1.040730875447083e-06,
+ "logits/chosen": -4.3218793869018555,
+ "logits/rejected": -4.3270487785339355,
+ "logps/chosen": -451.809814453125,
+ "logps/rejected": -478.60491943359375,
+ "loss": 0.6503,
+ "rewards/accuracies": 0.668749988079071,
+ "rewards/chosen": -1.565606713294983,
+ "rewards/margins": 0.20372018218040466,
+ "rewards/rejected": -1.7693268060684204,
+ "step": 910
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 9.845720158932414e-07,
+ "logits/chosen": -4.289103031158447,
+ "logits/rejected": -4.304252624511719,
+ "logps/chosen": -420.05926513671875,
+ "logps/rejected": -442.1366271972656,
+ "loss": 0.6505,
+ "rewards/accuracies": 0.625,
+ "rewards/chosen": -1.4516870975494385,
+ "rewards/margins": 0.20668797194957733,
+ "rewards/rejected": -1.6583747863769531,
+ "step": 920
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 9.295969449372796e-07,
+ "logits/chosen": -4.317067623138428,
+ "logits/rejected": -4.289021968841553,
+ "logps/chosen": -417.4593811035156,
+ "logps/rejected": -454.826904296875,
+ "loss": 0.6219,
+ "rewards/accuracies": 0.6625000238418579,
+ "rewards/chosen": -1.3394687175750732,
+ "rewards/margins": 0.25516074895858765,
+ "rewards/rejected": -1.5946292877197266,
+ "step": 930
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 8.758486067922176e-07,
+ "logits/chosen": -4.306538105010986,
+ "logits/rejected": -4.262487411499023,
+ "logps/chosen": -414.806884765625,
+ "logps/rejected": -465.29931640625,
+ "loss": 0.5998,
+ "rewards/accuracies": 0.699999988079071,
+ "rewards/chosen": -1.3654025793075562,
+ "rewards/margins": 0.3171504735946655,
+ "rewards/rejected": -1.6825529336929321,
+ "step": 940
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 8.233689873990006e-07,
+ "logits/chosen": -4.317531585693359,
+ "logits/rejected": -4.282795429229736,
+ "logps/chosen": -420.382568359375,
+ "logps/rejected": -493.355712890625,
+ "loss": 0.5953,
+ "rewards/accuracies": 0.6312500238418579,
+ "rewards/chosen": -1.419424295425415,
+ "rewards/margins": 0.5507811903953552,
+ "rewards/rejected": -1.970205545425415,
+ "step": 950
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 7.721990816288555e-07,
+ "logits/chosen": -4.283775329589844,
+ "logits/rejected": -4.2459330558776855,
+ "logps/chosen": -395.4678955078125,
+ "logps/rejected": -434.7349548339844,
+ "loss": 0.6193,
+ "rewards/accuracies": 0.6875,
+ "rewards/chosen": -1.3240183591842651,
+ "rewards/margins": 0.2636045813560486,
+ "rewards/rejected": -1.587622880935669,
+ "step": 960
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 7.223788612598148e-07,
+ "logits/chosen": -4.310162544250488,
+ "logits/rejected": -4.287927627563477,
+ "logps/chosen": -436.2978515625,
+ "logps/rejected": -470.63079833984375,
+ "loss": 0.6316,
+ "rewards/accuracies": 0.643750011920929,
+ "rewards/chosen": -1.4355452060699463,
+ "rewards/margins": 0.18634586036205292,
+ "rewards/rejected": -1.6218910217285156,
+ "step": 970
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 6.73947243752448e-07,
+ "logits/chosen": -4.377969264984131,
+ "logits/rejected": -4.389736652374268,
+ "logps/chosen": -458.318359375,
+ "logps/rejected": -487.77874755859375,
+ "loss": 0.6632,
+ "rewards/accuracies": 0.612500011920929,
+ "rewards/chosen": -1.6293509006500244,
+ "rewards/margins": 0.36782675981521606,
+ "rewards/rejected": -1.9971777200698853,
+ "step": 980
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 6.269420618491759e-07,
+ "logits/chosen": -4.321467876434326,
+ "logits/rejected": -4.286978721618652,
+ "logps/chosen": -420.07220458984375,
+ "logps/rejected": -458.8563537597656,
+ "loss": 0.6383,
+ "rewards/accuracies": 0.625,
+ "rewards/chosen": -1.4905959367752075,
+ "rewards/margins": 0.22982291877269745,
+ "rewards/rejected": -1.720418930053711,
+ "step": 990
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 5.814000340209267e-07,
+ "logits/chosen": -4.299461364746094,
+ "logits/rejected": -4.237879276275635,
+ "logps/chosen": -435.0042419433594,
+ "logps/rejected": -491.1070861816406,
+ "loss": 0.5993,
+ "rewards/accuracies": 0.7250000238418579,
+ "rewards/chosen": -1.4882529973983765,
+ "rewards/margins": 0.3519567549228668,
+ "rewards/rejected": -1.8402099609375,
+ "step": 1000
+ },
+ {
+ "epoch": 0.8,
+ "eval_logits/chosen": -4.549124717712402,
+ "eval_logits/rejected": -4.535641193389893,
+ "eval_logps/chosen": -378.4713439941406,
+ "eval_logps/rejected": -398.2364196777344,
+ "eval_loss": 0.6785325407981873,
+ "eval_rewards/accuracies": 0.5683333277702332,
+ "eval_rewards/chosen": -1.577589511871338,
+ "eval_rewards/margins": 0.1414840668439865,
+ "eval_rewards/rejected": -1.7190735340118408,
+ "eval_runtime": 1620.6034,
+ "eval_samples_per_second": 1.847,
+ "eval_steps_per_second": 0.231,
+ "step": 1000
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 5.373567357842111e-07,
+ "logits/chosen": -4.278590202331543,
+ "logits/rejected": -4.246352195739746,
+ "logps/chosen": -429.3436584472656,
+ "logps/rejected": -471.84722900390625,
+ "loss": 0.5957,
+ "rewards/accuracies": 0.643750011920929,
+ "rewards/chosen": -1.422374963760376,
+ "rewards/margins": 0.33030739426612854,
+ "rewards/rejected": -1.7526824474334717,
+ "step": 1010
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 4.948465719110226e-07,
+ "logits/chosen": -4.373248100280762,
+ "logits/rejected": -4.3430304527282715,
+ "logps/chosen": -415.3047790527344,
+ "logps/rejected": -445.019287109375,
+ "loss": 0.6443,
+ "rewards/accuracies": 0.625,
+ "rewards/chosen": -1.40928053855896,
+ "rewards/margins": 0.228702574968338,
+ "rewards/rejected": -1.6379830837249756,
+ "step": 1020
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 4.539027495532766e-07,
+ "logits/chosen": -4.33120059967041,
+ "logits/rejected": -4.347836494445801,
+ "logps/chosen": -415.25555419921875,
+ "logps/rejected": -453.518798828125,
+ "loss": 0.6177,
+ "rewards/accuracies": 0.65625,
+ "rewards/chosen": -1.4035775661468506,
+ "rewards/margins": 0.28108319640159607,
+ "rewards/rejected": -1.6846606731414795,
+ "step": 1030
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 4.14557252302783e-07,
+ "logits/chosen": -4.341530799865723,
+ "logits/rejected": -4.305572032928467,
+ "logps/chosen": -433.0341796875,
+ "logps/rejected": -472.1036682128906,
+ "loss": 0.6713,
+ "rewards/accuracies": 0.637499988079071,
+ "rewards/chosen": -1.5271722078323364,
+ "rewards/margins": 0.23339995741844177,
+ "rewards/rejected": -1.760572075843811,
+ "step": 1040
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 3.7684081520700884e-07,
+ "logits/chosen": -4.2357988357543945,
+ "logits/rejected": -4.237625598907471,
+ "logps/chosen": -450.54827880859375,
+ "logps/rejected": -471.02423095703125,
+ "loss": 0.6429,
+ "rewards/accuracies": 0.643750011920929,
+ "rewards/chosen": -1.5025393962860107,
+ "rewards/margins": 0.2623196542263031,
+ "rewards/rejected": -1.7648589611053467,
+ "step": 1050
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 3.407829007601507e-07,
+ "logits/chosen": -4.270508766174316,
+ "logits/rejected": -4.225382328033447,
+ "logps/chosen": -428.3861389160156,
+ "logps/rejected": -479.0707092285156,
+ "loss": 0.6119,
+ "rewards/accuracies": 0.668749988079071,
+ "rewards/chosen": -1.4203639030456543,
+ "rewards/margins": 0.3360464870929718,
+ "rewards/rejected": -1.7564103603363037,
+ "step": 1060
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 3.064116758882724e-07,
+ "logits/chosen": -4.24053955078125,
+ "logits/rejected": -4.1828999519348145,
+ "logps/chosen": -443.78485107421875,
+ "logps/rejected": -504.28070068359375,
+ "loss": 0.6037,
+ "rewards/accuracies": 0.6875,
+ "rewards/chosen": -1.5132863521575928,
+ "rewards/margins": 0.4130435883998871,
+ "rewards/rejected": -1.9263302087783813,
+ "step": 1070
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 2.737539899464908e-07,
+ "logits/chosen": -4.2971696853637695,
+ "logits/rejected": -4.288466453552246,
+ "logps/chosen": -403.1819763183594,
+ "logps/rejected": -454.8402404785156,
+ "loss": 0.6141,
+ "rewards/accuracies": 0.643750011920929,
+ "rewards/chosen": -1.3817451000213623,
+ "rewards/margins": 0.379900187253952,
+ "rewards/rejected": -1.7616455554962158,
+ "step": 1080
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 2.4283535374538645e-07,
+ "logits/chosen": -4.242150783538818,
+ "logits/rejected": -4.240577220916748,
+ "logps/chosen": -434.7925720214844,
+ "logps/rejected": -473.67803955078125,
+ "loss": 0.6122,
+ "rewards/accuracies": 0.637499988079071,
+ "rewards/chosen": -1.4003090858459473,
+ "rewards/margins": 0.31833842396736145,
+ "rewards/rejected": -1.7186473608016968,
+ "step": 1090
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 2.1367991962303298e-07,
+ "logits/chosen": -4.232297420501709,
+ "logits/rejected": -4.2173943519592285,
+ "logps/chosen": -418.9490661621094,
+ "logps/rejected": -446.4789123535156,
+ "loss": 0.6759,
+ "rewards/accuracies": 0.5874999761581421,
+ "rewards/chosen": -1.5112884044647217,
+ "rewards/margins": 0.1443391889333725,
+ "rewards/rejected": -1.6556276082992554,
+ "step": 1100
+ },
+ {
+ "epoch": 0.88,
+ "eval_logits/chosen": -4.485485553741455,
+ "eval_logits/rejected": -4.472198009490967,
+ "eval_logps/chosen": -375.86541748046875,
+ "eval_logps/rejected": -395.56036376953125,
+ "eval_loss": 0.6777821779251099,
+ "eval_rewards/accuracies": 0.5686666369438171,
+ "eval_rewards/chosen": -1.5515305995941162,
+ "eval_rewards/margins": 0.14078289270401,
+ "eval_rewards/rejected": -1.6923134326934814,
+ "eval_runtime": 1618.2899,
+ "eval_samples_per_second": 1.85,
+ "eval_steps_per_second": 0.232,
+ "step": 1100
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 1.8631046257820278e-07,
+ "logits/chosen": -4.233702659606934,
+ "logits/rejected": -4.249814033508301,
+ "logps/chosen": -432.20458984375,
+ "logps/rejected": -461.7394104003906,
+ "loss": 0.6275,
+ "rewards/accuracies": 0.6499999761581421,
+ "rewards/chosen": -1.41855788230896,
+ "rewards/margins": 0.2600322365760803,
+ "rewards/rejected": -1.678590178489685,
+ "step": 1110
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 1.6074836247950143e-07,
+ "logits/chosen": -4.206725597381592,
+ "logits/rejected": -4.209759712219238,
+ "logps/chosen": -443.52459716796875,
+ "logps/rejected": -470.96588134765625,
+ "loss": 0.6415,
+ "rewards/accuracies": 0.6312500238418579,
+ "rewards/chosen": -1.4318974018096924,
+ "rewards/margins": 0.21563585102558136,
+ "rewards/rejected": -1.6475334167480469,
+ "step": 1120
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 1.370135873643097e-07,
+ "logits/chosen": -4.226916313171387,
+ "logits/rejected": -4.266509532928467,
+ "logps/chosen": -436.6336975097656,
+ "logps/rejected": -456.3214416503906,
+ "loss": 0.6262,
+ "rewards/accuracies": 0.6499999761581421,
+ "rewards/chosen": -1.4590485095977783,
+ "rewards/margins": 0.2844700217247009,
+ "rewards/rejected": -1.7435184717178345,
+ "step": 1130
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 1.1512467784059372e-07,
+ "logits/chosen": -4.302299499511719,
+ "logits/rejected": -4.261002540588379,
+ "logps/chosen": -398.5301208496094,
+ "logps/rejected": -437.83673095703125,
+ "loss": 0.6221,
+ "rewards/accuracies": 0.6312500238418579,
+ "rewards/chosen": -1.3698493242263794,
+ "rewards/margins": 0.2959776520729065,
+ "rewards/rejected": -1.6658270359039307,
+ "step": 1140
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 9.509873260376251e-08,
+ "logits/chosen": -4.228875160217285,
+ "logits/rejected": -4.17572021484375,
+ "logps/chosen": -429.0230407714844,
+ "logps/rejected": -501.6388244628906,
+ "loss": 0.5814,
+ "rewards/accuracies": 0.699999988079071,
+ "rewards/chosen": -1.4344505071640015,
+ "rewards/margins": 0.4450379014015198,
+ "rewards/rejected": -1.8794885873794556,
+ "step": 1150
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 7.695139507988559e-08,
+ "logits/chosen": -4.261081218719482,
+ "logits/rejected": -4.282492160797119,
+ "logps/chosen": -444.05096435546875,
+ "logps/rejected": -479.0032653808594,
+ "loss": 0.6376,
+ "rewards/accuracies": 0.6000000238418579,
+ "rewards/chosen": -1.457777500152588,
+ "rewards/margins": 0.2464013397693634,
+ "rewards/rejected": -1.704178810119629,
+ "step": 1160
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 6.069684120570684e-08,
+ "logits/chosen": -4.286696434020996,
+ "logits/rejected": -4.1916022300720215,
+ "logps/chosen": -431.74053955078125,
+ "logps/rejected": -482.0506896972656,
+ "loss": 0.6355,
+ "rewards/accuracies": 0.625,
+ "rewards/chosen": -1.5193744897842407,
+ "rewards/margins": 0.3015151619911194,
+ "rewards/rejected": -1.8208894729614258,
+ "step": 1170
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 4.634776835499871e-08,
+ "logits/chosen": -4.216092109680176,
+ "logits/rejected": -4.185781955718994,
+ "logps/chosen": -413.5948791503906,
+ "logps/rejected": -455.0482482910156,
+ "loss": 0.6371,
+ "rewards/accuracies": 0.6625000238418579,
+ "rewards/chosen": -1.4376914501190186,
+ "rewards/margins": 0.27367502450942993,
+ "rewards/rejected": -1.7113662958145142,
+ "step": 1180
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 3.3915385419908964e-08,
+ "logits/chosen": -4.179436206817627,
+ "logits/rejected": -4.222240447998047,
+ "logps/chosen": -430.4244079589844,
+ "logps/rejected": -464.51629638671875,
+ "loss": 0.6297,
+ "rewards/accuracies": 0.6000000238418579,
+ "rewards/chosen": -1.4367833137512207,
+ "rewards/margins": 0.29136672616004944,
+ "rewards/rejected": -1.7281500101089478,
+ "step": 1190
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 2.3409404055043938e-08,
+ "logits/chosen": -4.306519508361816,
+ "logits/rejected": -4.272242546081543,
+ "logps/chosen": -437.791259765625,
+ "logps/rejected": -471.58575439453125,
+ "loss": 0.6402,
+ "rewards/accuracies": 0.5874999761581421,
+ "rewards/chosen": -1.5211617946624756,
+ "rewards/margins": 0.2325367033481598,
+ "rewards/rejected": -1.7536985874176025,
+ "step": 1200
+ },
+ {
+ "epoch": 0.96,
+ "eval_logits/chosen": -4.457742691040039,
+ "eval_logits/rejected": -4.444427967071533,
+ "eval_logps/chosen": -375.10284423828125,
+ "eval_logps/rejected": -394.7001647949219,
+ "eval_loss": 0.6772644519805908,
+ "eval_rewards/accuracies": 0.5690000057220459,
+ "eval_rewards/chosen": -1.5439047813415527,
+ "eval_rewards/margins": 0.13980673253536224,
+ "eval_rewards/rejected": -1.6837116479873657,
+ "eval_runtime": 1624.6813,
+ "eval_samples_per_second": 1.843,
+ "eval_steps_per_second": 0.231,
+ "step": 1200
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 1.4838031091134186e-08,
+ "logits/chosen": -4.243393898010254,
+ "logits/rejected": -4.175347805023193,
+ "logps/chosen": -404.5989990234375,
+ "logps/rejected": -469.9576110839844,
+ "loss": 0.5982,
+ "rewards/accuracies": 0.6937500238418579,
+ "rewards/chosen": -1.3969910144805908,
+ "rewards/margins": 0.3869909644126892,
+ "rewards/rejected": -1.7839819192886353,
+ "step": 1210
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 8.207962124201774e-09,
+ "logits/chosen": -4.263760566711426,
+ "logits/rejected": -4.225130081176758,
+ "logps/chosen": -435.9114685058594,
+ "logps/rejected": -472.66839599609375,
+ "loss": 0.6017,
+ "rewards/accuracies": 0.675000011920929,
+ "rewards/chosen": -1.37319016456604,
+ "rewards/margins": 0.3209651708602905,
+ "rewards/rejected": -1.6941554546356201,
+ "step": 1220
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 3.5243762852441023e-09,
+ "logits/chosen": -4.210858345031738,
+ "logits/rejected": -4.163503170013428,
+ "logps/chosen": -427.44342041015625,
+ "logps/rejected": -469.5209045410156,
+ "loss": 0.6376,
+ "rewards/accuracies": 0.6000000238418579,
+ "rewards/chosen": -1.4386152029037476,
+ "rewards/margins": 0.2547362446784973,
+ "rewards/rejected": -1.6933513879776,
+ "step": 1230
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 7.909321945129278e-10,
+ "logits/chosen": -4.18636417388916,
+ "logits/rejected": -4.141125679016113,
+ "logps/chosen": -443.1625061035156,
+ "logps/rejected": -493.43975830078125,
+ "loss": 0.6016,
+ "rewards/accuracies": 0.6499999761581421,
+ "rewards/chosen": -1.4015429019927979,
+ "rewards/margins": 0.35573890805244446,
+ "rewards/rejected": -1.7572818994522095,
+ "step": 1240
+ },
+ {
+ "epoch": 1.0,
+ "step": 1249,
+ "total_flos": 0.0,
+ "train_loss": 0.6475976420174225,
+ "train_runtime": 42677.4758,
+ "train_samples_per_second": 0.469,
+ "train_steps_per_second": 0.029
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 1249,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 0.0,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }