BraylonDash committed on
Commit
9237faa
1 Parent(s): 086f5b8

Model save

README.md ADDED
@@ -0,0 +1,84 @@
+ ---
+ license: mit
+ library_name: peft
+ tags:
+ - trl
+ - dpo
+ - generated_from_trainer
+ base_model: microsoft/phi-2
+ model-index:
+ - name: phi-2-dpo-test-iter-0
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # phi-2-dpo-test-iter-0
+
+ This model is a fine-tuned version of [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) on an unspecified preference dataset (the dataset name was not recorded by the Trainer).
+ It achieves the following results on the evaluation set:
+ - Loss: 0.0002
+ - Rewards/chosen: -0.0029
+ - Rewards/rejected: -0.0032
+ - Rewards/accuracies: 0.5130
+ - Rewards/margins: 0.0003
+ - Logps/rejected: -233.8547
+ - Logps/chosen: -256.9005
+ - Logits/rejected: 0.8721
+ - Logits/chosen: 0.8145
+
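+ For reference, the `Rewards/*` metrics above follow TRL's DPO convention: the implicit reward of a completion $y$ for a prompt $x$ is $\beta\,[\log \pi_\theta(y \mid x) - \log \pi_{\mathrm{ref}}(y \mid x)]$, `Rewards/margins` is the mean gap between chosen and rejected rewards, and `Rewards/accuracies` is the fraction of pairs whose chosen reward exceeds the rejected one. Training minimizes the standard DPO loss (the $\beta$ used for this run is not recorded in this card):
+
+ $$
+ \mathcal{L}_{\mathrm{DPO}} = -\,\mathbb{E}_{(x,\,y_w,\,y_l)}\!\left[\log \sigma\!\left(\beta \log \frac{\pi_\theta(y_w \mid x)}{\pi_{\mathrm{ref}}(y_w \mid x)} \;-\; \beta \log \frac{\pi_\theta(y_l \mid x)}{\pi_{\mathrm{ref}}(y_l \mid x)}\right)\right]
+ $$
+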
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-06
+ - train_batch_size: 4
+ - eval_batch_size: 4
+ - seed: 42
+ - distributed_type: multi-GPU
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 16
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 4
+
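+ The training script is not included in this repository. The sketch below shows how a comparable run could be set up with TRL's `DPOTrainer` and a PEFT LoRA adapter using the hyperparameters listed above; the LoRA settings, `beta`, sequence lengths, precision, and the toy preference pairs are illustrative assumptions, and the exact `DPOTrainer` signature varies between TRL releases.
+
+ ```python
+ # Illustrative only: the training script for this run is not part of the repo.
+ # LoRA targets, beta, max lengths, precision, and the toy dataset are assumptions.
+ import torch
+ from datasets import Dataset
+ from peft import LoraConfig
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
+ from trl import DPOTrainer
+
+ model_name = "microsoft/phi-2"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ tokenizer.pad_token = tokenizer.eos_token
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name, torch_dtype=torch.bfloat16, trust_remote_code=True
+ )
+
+ # DPO expects preference pairs; a single toy pair stands in for the real data.
+ train_dataset = Dataset.from_dict({
+     "prompt": ["What is the capital of France?"],
+     "chosen": ["The capital of France is Paris."],
+     "rejected": ["France has no capital city."],
+ })
+
+ peft_config = LoraConfig(  # adapter hyperparameters are not recorded in this card
+     r=16,
+     lora_alpha=32,
+     lora_dropout=0.05,
+     target_modules=["q_proj", "k_proj", "v_proj", "dense"],
+     task_type="CAUSAL_LM",
+ )
+
+ training_args = TrainingArguments(  # mirrors the list above; launch with accelerate/torchrun for the multi-GPU setup
+     output_dir="phi-2-dpo-test-iter-0",
+     learning_rate=5e-6,
+     per_device_train_batch_size=4,
+     per_device_eval_batch_size=4,
+     gradient_accumulation_steps=4,
+     num_train_epochs=4,
+     lr_scheduler_type="cosine",
+     warmup_ratio=0.1,
+     seed=42,
+     logging_steps=10,
+     bf16=True,
+ )
+
+ trainer = DPOTrainer(
+     model,
+     ref_model=None,  # with a peft_config, TRL uses the frozen base model as the reference
+     beta=0.1,        # the beta used for this run is not recorded in the card
+     args=training_args,
+     train_dataset=train_dataset,
+     tokenizer=tokenizer,
+     peft_config=peft_config,
+     max_length=1024,
+     max_prompt_length=512,
+ )
+ trainer.train()
+ ```
+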
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
+ |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
+ | 0.0001 | 0.32 | 100 | 0.0002 | -0.0012 | -0.0015 | 0.5200 | 0.0003 | -233.6874 | -256.7341 | 0.8840 | 0.8263 |
+ | 0.0001 | 0.64 | 200 | 0.0002 | -0.0021 | -0.0023 | 0.5005 | 0.0002 | -233.7691 | -256.8278 | 0.8778 | 0.8201 |
+ | 0.0001 | 0.96 | 300 | 0.0002 | -0.0021 | -0.0024 | 0.4985 | 0.0003 | -233.7780 | -256.8272 | 0.8783 | 0.8206 |
+ | 0.0001 | 1.28 | 400 | 0.0002 | -0.0026 | -0.0029 | 0.5195 | 0.0003 | -233.8277 | -256.8757 | 0.8769 | 0.8192 |
+ | 0.0001 | 1.6 | 500 | 0.0002 | -0.0027 | -0.0030 | 0.5170 | 0.0003 | -233.8388 | -256.8869 | 0.8729 | 0.8151 |
+ | 0.0001 | 1.92 | 600 | 0.0002 | -0.0027 | -0.0030 | 0.5070 | 0.0003 | -233.8414 | -256.8860 | 0.8757 | 0.8180 |
+ | 0.0001 | 2.24 | 700 | 0.0002 | -0.0030 | -0.0032 | 0.5065 | 0.0002 | -233.8592 | -256.9123 | 0.8719 | 0.8142 |
+ | 0.0001 | 2.56 | 800 | 0.0002 | -0.0028 | -0.0030 | 0.5190 | 0.0003 | -233.8422 | -256.8898 | 0.8713 | 0.8135 |
+ | 0.0001 | 2.88 | 900 | 0.0002 | -0.0030 | -0.0031 | 0.5015 | 0.0002 | -233.8529 | -256.9111 | 0.8714 | 0.8136 |
+ | 0.0001 | 3.2 | 1000 | 0.0002 | -0.0029 | -0.0033 | 0.5180 | 0.0004 | -233.8666 | -256.9036 | 0.8733 | 0.8156 |
+ | 0.0001 | 3.52 | 1100 | 0.0002 | -0.0029 | -0.0034 | 0.5265 | 0.0005 | -233.8779 | -256.9080 | 0.8724 | 0.8145 |
+ | 0.0001 | 3.84 | 1200 | 0.0002 | -0.0031 | -0.0033 | 0.5045 | 0.0003 | -233.8733 | -256.9227 | 0.8705 | 0.8127 |
+
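+ The per-checkpoint values in this table are also stored step by step in `trainer_state.json` under `log_history`. A minimal sketch for plotting them, assuming the file sits in the working directory:
+
+ ```python
+ # Minimal sketch: read the step-wise eval metrics that back the table above
+ # from trainer_state.json (assumed to be in the current directory).
+ import json
+
+ import matplotlib.pyplot as plt
+
+ with open("trainer_state.json") as f:
+     state = json.load(f)
+
+ # Evaluation entries in log_history carry the "eval_rewards/*" keys.
+ evals = [e for e in state["log_history"] if "eval_rewards/margins" in e]
+ steps = [e["step"] for e in evals]
+ margins = [e["eval_rewards/margins"] for e in evals]
+ accuracies = [e["eval_rewards/accuracies"] for e in evals]
+
+ fig, ax_margin = plt.subplots()
+ ax_margin.plot(steps, margins, marker="o", label="eval_rewards/margins")
+ ax_margin.set_xlabel("step")
+ ax_margin.set_ylabel("reward margin")
+
+ ax_acc = ax_margin.twinx()
+ ax_acc.plot(steps, accuracies, marker="s", color="tab:orange", label="eval_rewards/accuracies")
+ ax_acc.set_ylabel("reward accuracy")
+
+ fig.legend(loc="upper left")
+ fig.tight_layout()
+ plt.show()
+ ```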
+
+ ### Framework versions
+
+ - PEFT 0.7.1
+ - Transformers 4.36.2
+ - Pytorch 2.2.1+cu121
+ - Datasets 2.14.6
+ - Tokenizers 0.15.2
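+
+ To run the adapter, load the base model and attach the LoRA weights with PEFT. This is a minimal sketch; the adapter repo id below is a placeholder, not a confirmed location:
+
+ ```python
+ # Minimal inference sketch. Point the placeholder repo id at wherever this adapter
+ # (adapter_model.safetensors + adapter_config.json) is actually hosted.
+ import torch
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ base = AutoModelForCausalLM.from_pretrained(
+     "microsoft/phi-2", torch_dtype=torch.bfloat16, trust_remote_code=True
+ )
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
+
+ model = PeftModel.from_pretrained(base, "<user>/phi-2-dpo-test-iter-0")  # placeholder repo id
+ model.eval()
+
+ prompt = "Instruct: Explain direct preference optimization in one sentence.\nOutput:"
+ inputs = tokenizer(prompt, return_tensors="pt")
+ with torch.no_grad():
+     output = model.generate(**inputs, max_new_tokens=64)
+ print(tokenizer.decode(output[0], skip_special_tokens=True))
+ ```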
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:037da54606f90b7e0e295a6b6f87be936b95244aed4c47b258419b348fe26a60
+ oid sha256:08f46b208abcd902f8b18e4179aea06a78b6e88e8af55a35beb190f9b3ea699c
 size 41977616
all_results.json ADDED
@@ -0,0 +1,21 @@
+ {
+     "epoch": 3.99,
+     "eval_logits/chosen": 0.81451815366745,
+     "eval_logits/rejected": 0.8721050024032593,
+     "eval_logps/chosen": -256.90045166015625,
+     "eval_logps/rejected": -233.8546905517578,
+     "eval_loss": 0.00017304150969721377,
+     "eval_rewards/accuracies": 0.5130000114440918,
+     "eval_rewards/chosen": -0.0028624406550079584,
+     "eval_rewards/margins": 0.0003016398404724896,
+     "eval_rewards/rejected": -0.003164080437272787,
+     "eval_runtime": 412.2426,
+     "eval_samples": 2000,
+     "eval_samples_per_second": 4.852,
+     "eval_steps_per_second": 1.213,
+     "train_loss": 9.35627439654733e-05,
+     "train_runtime": 14053.8323,
+     "train_samples": 61135,
+     "train_samples_per_second": 1.423,
+     "train_steps_per_second": 0.089
+ }
eval_results.json ADDED
@@ -0,0 +1,16 @@
+ {
+     "epoch": 3.99,
+     "eval_logits/chosen": 0.81451815366745,
+     "eval_logits/rejected": 0.8721050024032593,
+     "eval_logps/chosen": -256.90045166015625,
+     "eval_logps/rejected": -233.8546905517578,
+     "eval_loss": 0.00017304150969721377,
+     "eval_rewards/accuracies": 0.5130000114440918,
+     "eval_rewards/chosen": -0.0028624406550079584,
+     "eval_rewards/margins": 0.0003016398404724896,
+     "eval_rewards/rejected": -0.003164080437272787,
+     "eval_runtime": 412.2426,
+     "eval_samples": 2000,
+     "eval_samples_per_second": 4.852,
+     "eval_steps_per_second": 1.213
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 3.99,
+     "train_loss": 9.35627439654733e-05,
+     "train_runtime": 14053.8323,
+     "train_samples": 61135,
+     "train_samples_per_second": 1.423,
+     "train_steps_per_second": 0.089
+ }
trainer_state.json ADDED
@@ -0,0 +1,1972 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.9936,
5
+ "eval_steps": 100,
6
+ "global_step": 1248,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 4e-08,
14
+ "logits/chosen": 0.76749187707901,
15
+ "logits/rejected": 1.0001295804977417,
16
+ "logps/chosen": -205.27383422851562,
17
+ "logps/rejected": -130.56936645507812,
18
+ "loss": 0.0001,
19
+ "rewards/accuracies": 0.0,
20
+ "rewards/chosen": 0.0,
21
+ "rewards/margins": 0.0,
22
+ "rewards/rejected": 0.0,
23
+ "step": 1
24
+ },
25
+ {
26
+ "epoch": 0.03,
27
+ "learning_rate": 4.0000000000000003e-07,
28
+ "logits/chosen": 0.7141001224517822,
29
+ "logits/rejected": 0.7724499106407166,
30
+ "logps/chosen": -190.74786376953125,
31
+ "logps/rejected": -138.00537109375,
32
+ "loss": 0.0001,
33
+ "rewards/accuracies": 0.4027777910232544,
34
+ "rewards/chosen": 0.0002450596366543323,
35
+ "rewards/margins": 0.0004758307186421007,
36
+ "rewards/rejected": -0.0002307710237801075,
37
+ "step": 10
38
+ },
39
+ {
40
+ "epoch": 0.06,
41
+ "learning_rate": 8.000000000000001e-07,
42
+ "logits/chosen": 0.601749062538147,
43
+ "logits/rejected": 0.879185676574707,
44
+ "logps/chosen": -173.01181030273438,
45
+ "logps/rejected": -116.30070495605469,
46
+ "loss": 0.0001,
47
+ "rewards/accuracies": 0.4749999940395355,
48
+ "rewards/chosen": 4.058447757415706e-06,
49
+ "rewards/margins": -0.00013263085565995425,
50
+ "rewards/rejected": 0.00013668931205756962,
51
+ "step": 20
52
+ },
53
+ {
54
+ "epoch": 0.1,
55
+ "learning_rate": 1.2000000000000002e-06,
56
+ "logits/chosen": 0.6547126173973083,
57
+ "logits/rejected": 0.833116352558136,
58
+ "logps/chosen": -187.99520874023438,
59
+ "logps/rejected": -127.53055572509766,
60
+ "loss": 0.0001,
61
+ "rewards/accuracies": 0.45625001192092896,
62
+ "rewards/chosen": 2.28220596909523e-06,
63
+ "rewards/margins": 0.00018625140364747494,
64
+ "rewards/rejected": -0.00018396916857454926,
65
+ "step": 30
66
+ },
67
+ {
68
+ "epoch": 0.13,
69
+ "learning_rate": 1.6000000000000001e-06,
70
+ "logits/chosen": 0.5628782510757446,
71
+ "logits/rejected": 0.745602548122406,
72
+ "logps/chosen": -183.92758178710938,
73
+ "logps/rejected": -134.64688110351562,
74
+ "loss": 0.0001,
75
+ "rewards/accuracies": 0.45625001192092896,
76
+ "rewards/chosen": -0.0005207593785598874,
77
+ "rewards/margins": -0.0005336635513231158,
78
+ "rewards/rejected": 1.290418458665954e-05,
79
+ "step": 40
80
+ },
81
+ {
82
+ "epoch": 0.16,
83
+ "learning_rate": 2.0000000000000003e-06,
84
+ "logits/chosen": 0.6242247819900513,
85
+ "logits/rejected": 0.8314679265022278,
86
+ "logps/chosen": -171.67556762695312,
87
+ "logps/rejected": -120.4903335571289,
88
+ "loss": 0.0001,
89
+ "rewards/accuracies": 0.5,
90
+ "rewards/chosen": 0.00020891983876936138,
91
+ "rewards/margins": 0.0005679951282218099,
92
+ "rewards/rejected": -0.00035907537676393986,
93
+ "step": 50
94
+ },
95
+ {
96
+ "epoch": 0.19,
97
+ "learning_rate": 2.4000000000000003e-06,
98
+ "logits/chosen": 0.7160875797271729,
99
+ "logits/rejected": 0.8556329607963562,
100
+ "logps/chosen": -185.2340087890625,
101
+ "logps/rejected": -124.66600036621094,
102
+ "loss": 0.0001,
103
+ "rewards/accuracies": 0.4312500059604645,
104
+ "rewards/chosen": 0.0006271885358728468,
105
+ "rewards/margins": 0.0005722854984924197,
106
+ "rewards/rejected": 5.4903095588088036e-05,
107
+ "step": 60
108
+ },
109
+ {
110
+ "epoch": 0.22,
111
+ "learning_rate": 2.8000000000000003e-06,
112
+ "logits/chosen": 0.6916844844818115,
113
+ "logits/rejected": 0.8196538090705872,
114
+ "logps/chosen": -161.62498474121094,
115
+ "logps/rejected": -107.52471923828125,
116
+ "loss": 0.0001,
117
+ "rewards/accuracies": 0.48750001192092896,
118
+ "rewards/chosen": 0.0007440428016707301,
119
+ "rewards/margins": 0.0006342666456475854,
120
+ "rewards/rejected": 0.00010977611964335665,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.26,
125
+ "learning_rate": 3.2000000000000003e-06,
126
+ "logits/chosen": 0.6701158285140991,
127
+ "logits/rejected": 0.8293789029121399,
128
+ "logps/chosen": -177.7081298828125,
129
+ "logps/rejected": -122.1727294921875,
130
+ "loss": 0.0001,
131
+ "rewards/accuracies": 0.512499988079071,
132
+ "rewards/chosen": 0.0014677448198199272,
133
+ "rewards/margins": 0.000520342611707747,
134
+ "rewards/rejected": 0.0009474022081121802,
135
+ "step": 80
136
+ },
137
+ {
138
+ "epoch": 0.29,
139
+ "learning_rate": 3.6000000000000003e-06,
140
+ "logits/chosen": 0.6228800415992737,
141
+ "logits/rejected": 0.7212022542953491,
142
+ "logps/chosen": -183.03811645507812,
143
+ "logps/rejected": -139.0586700439453,
144
+ "loss": 0.0001,
145
+ "rewards/accuracies": 0.53125,
146
+ "rewards/chosen": 0.0022542846854776144,
147
+ "rewards/margins": 0.0009365282021462917,
148
+ "rewards/rejected": 0.0013177564833313227,
149
+ "step": 90
150
+ },
151
+ {
152
+ "epoch": 0.32,
153
+ "learning_rate": 4.000000000000001e-06,
154
+ "logits/chosen": 0.6344588994979858,
155
+ "logits/rejected": 0.8314794301986694,
156
+ "logps/chosen": -163.23043823242188,
157
+ "logps/rejected": -121.80052185058594,
158
+ "loss": 0.0001,
159
+ "rewards/accuracies": 0.5375000238418579,
160
+ "rewards/chosen": 0.002225330099463463,
161
+ "rewards/margins": 0.0011866830755025148,
162
+ "rewards/rejected": 0.0010386471403762698,
163
+ "step": 100
164
+ },
165
+ {
166
+ "epoch": 0.32,
167
+ "eval_logits/chosen": 0.8262824416160583,
168
+ "eval_logits/rejected": 0.8840107321739197,
169
+ "eval_logps/chosen": -256.7341003417969,
170
+ "eval_logps/rejected": -233.68739318847656,
171
+ "eval_loss": 0.00015139963943511248,
172
+ "eval_rewards/accuracies": 0.5199999809265137,
173
+ "eval_rewards/chosen": -0.00119913334492594,
174
+ "eval_rewards/margins": 0.0002922326675616205,
175
+ "eval_rewards/rejected": -0.0014913661871105433,
176
+ "eval_runtime": 415.0818,
177
+ "eval_samples_per_second": 4.818,
178
+ "eval_steps_per_second": 1.205,
179
+ "step": 100
180
+ },
181
+ {
182
+ "epoch": 0.35,
183
+ "learning_rate": 4.4e-06,
184
+ "logits/chosen": 0.6864518523216248,
185
+ "logits/rejected": 0.9017802476882935,
186
+ "logps/chosen": -174.11480712890625,
187
+ "logps/rejected": -118.93388366699219,
188
+ "loss": 0.0001,
189
+ "rewards/accuracies": 0.5375000238418579,
190
+ "rewards/chosen": 0.00283862859942019,
191
+ "rewards/margins": 0.0018309386214241385,
192
+ "rewards/rejected": 0.001007690210826695,
193
+ "step": 110
194
+ },
195
+ {
196
+ "epoch": 0.38,
197
+ "learning_rate": 4.800000000000001e-06,
198
+ "logits/chosen": 0.5682826638221741,
199
+ "logits/rejected": 0.6713820695877075,
200
+ "logps/chosen": -189.4928436279297,
201
+ "logps/rejected": -135.20498657226562,
202
+ "loss": 0.0001,
203
+ "rewards/accuracies": 0.5249999761581421,
204
+ "rewards/chosen": 0.0033208150416612625,
205
+ "rewards/margins": 0.001955473329871893,
206
+ "rewards/rejected": 0.0013653415953740478,
207
+ "step": 120
208
+ },
209
+ {
210
+ "epoch": 0.42,
211
+ "learning_rate": 4.999755441268144e-06,
212
+ "logits/chosen": 0.7162402868270874,
213
+ "logits/rejected": 0.8464711904525757,
214
+ "logps/chosen": -175.83523559570312,
215
+ "logps/rejected": -126.2065200805664,
216
+ "loss": 0.0001,
217
+ "rewards/accuracies": 0.5375000238418579,
218
+ "rewards/chosen": 0.0035425268579274416,
219
+ "rewards/margins": 0.0022287473548203707,
220
+ "rewards/rejected": 0.0013137792702764273,
221
+ "step": 130
222
+ },
223
+ {
224
+ "epoch": 0.45,
225
+ "learning_rate": 4.997799258487003e-06,
226
+ "logits/chosen": 0.6711681485176086,
227
+ "logits/rejected": 0.8102658987045288,
228
+ "logps/chosen": -182.16258239746094,
229
+ "logps/rejected": -128.60398864746094,
230
+ "loss": 0.0001,
231
+ "rewards/accuracies": 0.5375000238418579,
232
+ "rewards/chosen": 0.0029964938294142485,
233
+ "rewards/margins": 0.002129464875906706,
234
+ "rewards/rejected": 0.0008670290117152035,
235
+ "step": 140
236
+ },
237
+ {
238
+ "epoch": 0.48,
239
+ "learning_rate": 4.993888423734898e-06,
240
+ "logits/chosen": 0.6097667217254639,
241
+ "logits/rejected": 0.7144483327865601,
242
+ "logps/chosen": -190.95156860351562,
243
+ "logps/rejected": -141.19229125976562,
244
+ "loss": 0.0001,
245
+ "rewards/accuracies": 0.59375,
246
+ "rewards/chosen": 0.0037058107554912567,
247
+ "rewards/margins": 0.002363733481615782,
248
+ "rewards/rejected": 0.001342077157460153,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.51,
253
+ "learning_rate": 4.988025997434253e-06,
254
+ "logits/chosen": 0.6315719485282898,
255
+ "logits/rejected": 0.7987428307533264,
256
+ "logps/chosen": -181.68386840820312,
257
+ "logps/rejected": -128.19888305664062,
258
+ "loss": 0.0001,
259
+ "rewards/accuracies": 0.5562499761581421,
260
+ "rewards/chosen": 0.002868856769055128,
261
+ "rewards/margins": 0.0025774172972887754,
262
+ "rewards/rejected": 0.00029143941355869174,
263
+ "step": 160
264
+ },
265
+ {
266
+ "epoch": 0.54,
267
+ "learning_rate": 4.980216567224801e-06,
268
+ "logits/chosen": 0.6035071611404419,
269
+ "logits/rejected": 0.8083668947219849,
270
+ "logps/chosen": -181.84107971191406,
271
+ "logps/rejected": -126.97474670410156,
272
+ "loss": 0.0001,
273
+ "rewards/accuracies": 0.612500011920929,
274
+ "rewards/chosen": 0.0037610617000609636,
275
+ "rewards/margins": 0.0028130013961344957,
276
+ "rewards/rejected": 0.0009480599546805024,
277
+ "step": 170
278
+ },
279
+ {
280
+ "epoch": 0.58,
281
+ "learning_rate": 4.970466244373527e-06,
282
+ "logits/chosen": 0.6453949213027954,
283
+ "logits/rejected": 0.797042727470398,
284
+ "logps/chosen": -155.79519653320312,
285
+ "logps/rejected": -113.65773010253906,
286
+ "loss": 0.0001,
287
+ "rewards/accuracies": 0.574999988079071,
288
+ "rewards/chosen": 0.0029109427705407143,
289
+ "rewards/margins": 0.0023111128248274326,
290
+ "rewards/rejected": 0.0005998298292979598,
291
+ "step": 180
292
+ },
293
+ {
294
+ "epoch": 0.61,
295
+ "learning_rate": 4.958782658992307e-06,
296
+ "logits/chosen": 0.6763242483139038,
297
+ "logits/rejected": 0.8502823710441589,
298
+ "logps/chosen": -159.13275146484375,
299
+ "logps/rejected": -120.99656677246094,
300
+ "loss": 0.0001,
301
+ "rewards/accuracies": 0.4749999940395355,
302
+ "rewards/chosen": 0.0026291562244296074,
303
+ "rewards/margins": 0.0016078405315056443,
304
+ "rewards/rejected": 0.001021315692923963,
305
+ "step": 190
306
+ },
307
+ {
308
+ "epoch": 0.64,
309
+ "learning_rate": 4.945174954066957e-06,
310
+ "logits/chosen": 0.7041156888008118,
311
+ "logits/rejected": 0.7947753667831421,
312
+ "logps/chosen": -177.0174560546875,
313
+ "logps/rejected": -127.36385345458984,
314
+ "loss": 0.0001,
315
+ "rewards/accuracies": 0.625,
316
+ "rewards/chosen": 0.003798459889367223,
317
+ "rewards/margins": 0.0031563788652420044,
318
+ "rewards/rejected": 0.0006420810823328793,
319
+ "step": 200
320
+ },
321
+ {
322
+ "epoch": 0.64,
323
+ "eval_logits/chosen": 0.8200604319572449,
324
+ "eval_logits/rejected": 0.8777603507041931,
325
+ "eval_logps/chosen": -256.8277893066406,
326
+ "eval_logps/rejected": -233.76914978027344,
327
+ "eval_loss": 0.000158556635142304,
328
+ "eval_rewards/accuracies": 0.5005000233650208,
329
+ "eval_rewards/chosen": -0.00213597621768713,
330
+ "eval_rewards/margins": 0.00017285306239500642,
331
+ "eval_rewards/rejected": -0.0023088292218744755,
332
+ "eval_runtime": 412.468,
333
+ "eval_samples_per_second": 4.849,
334
+ "eval_steps_per_second": 1.212,
335
+ "step": 200
336
+ },
337
+ {
338
+ "epoch": 0.67,
339
+ "learning_rate": 4.929653778302397e-06,
340
+ "logits/chosen": 0.6099511981010437,
341
+ "logits/rejected": 0.811872661113739,
342
+ "logps/chosen": -192.26446533203125,
343
+ "logps/rejected": -131.25962829589844,
344
+ "loss": 0.0001,
345
+ "rewards/accuracies": 0.6499999761581421,
346
+ "rewards/chosen": 0.0029747250955551863,
347
+ "rewards/margins": 0.0029428431298583746,
348
+ "rewards/rejected": 3.1882118491921574e-05,
349
+ "step": 210
350
+ },
351
+ {
352
+ "epoch": 0.7,
353
+ "learning_rate": 4.912231277789509e-06,
354
+ "logits/chosen": 0.6272088289260864,
355
+ "logits/rejected": 0.767717719078064,
356
+ "logps/chosen": -193.66490173339844,
357
+ "logps/rejected": -143.585205078125,
358
+ "loss": 0.0001,
359
+ "rewards/accuracies": 0.6000000238418579,
360
+ "rewards/chosen": 0.003646609140560031,
361
+ "rewards/margins": 0.0021339866798371077,
362
+ "rewards/rejected": 0.0015126224607229233,
363
+ "step": 220
364
+ },
365
+ {
366
+ "epoch": 0.74,
367
+ "learning_rate": 4.892921086500219e-06,
368
+ "logits/chosen": 0.6231340169906616,
369
+ "logits/rejected": 0.7231715321540833,
370
+ "logps/chosen": -160.1138458251953,
371
+ "logps/rejected": -114.69986724853516,
372
+ "loss": 0.0001,
373
+ "rewards/accuracies": 0.6000000238418579,
374
+ "rewards/chosen": 0.003955821972340345,
375
+ "rewards/margins": 0.003026761580258608,
376
+ "rewards/rejected": 0.0009290605084970593,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 0.77,
381
+ "learning_rate": 4.871738315618236e-06,
382
+ "logits/chosen": 0.7013573050498962,
383
+ "logits/rejected": 0.8051062822341919,
384
+ "logps/chosen": -186.76821899414062,
385
+ "logps/rejected": -135.32489013671875,
386
+ "loss": 0.0001,
387
+ "rewards/accuracies": 0.59375,
388
+ "rewards/chosen": 0.003390461904928088,
389
+ "rewards/margins": 0.002614812459796667,
390
+ "rewards/rejected": 0.0007756495615467429,
391
+ "step": 240
392
+ },
393
+ {
394
+ "epoch": 0.8,
395
+ "learning_rate": 4.848699541713801e-06,
396
+ "logits/chosen": 0.6483660340309143,
397
+ "logits/rejected": 0.8247146606445312,
398
+ "logps/chosen": -159.59156799316406,
399
+ "logps/rejected": -118.78582763671875,
400
+ "loss": 0.0001,
401
+ "rewards/accuracies": 0.5375000238418579,
402
+ "rewards/chosen": 0.0031985179521143436,
403
+ "rewards/margins": 0.0021302015520632267,
404
+ "rewards/rejected": 0.0010683165164664388,
405
+ "step": 250
406
+ },
407
+ {
408
+ "epoch": 0.83,
409
+ "learning_rate": 4.823822793771696e-06,
410
+ "logits/chosen": 0.625135064125061,
411
+ "logits/rejected": 0.7819581627845764,
412
+ "logps/chosen": -169.98837280273438,
413
+ "logps/rejected": -119.9397964477539,
414
+ "loss": 0.0001,
415
+ "rewards/accuracies": 0.574999988079071,
416
+ "rewards/chosen": 0.00324636441655457,
417
+ "rewards/margins": 0.0028731045313179493,
418
+ "rewards/rejected": 0.0003732596233021468,
419
+ "step": 260
420
+ },
421
+ {
422
+ "epoch": 0.86,
423
+ "learning_rate": 4.797127539082669e-06,
424
+ "logits/chosen": 0.649788498878479,
425
+ "logits/rejected": 0.8432229161262512,
426
+ "logps/chosen": -191.59213256835938,
427
+ "logps/rejected": -134.23428344726562,
428
+ "loss": 0.0001,
429
+ "rewards/accuracies": 0.574999988079071,
430
+ "rewards/chosen": 0.0034480884205549955,
431
+ "rewards/margins": 0.002935568569228053,
432
+ "rewards/rejected": 0.0005125202005729079,
433
+ "step": 270
434
+ },
435
+ {
436
+ "epoch": 0.9,
437
+ "learning_rate": 4.7686346680093135e-06,
438
+ "logits/chosen": 0.6038953065872192,
439
+ "logits/rejected": 0.8061636686325073,
440
+ "logps/chosen": -188.86422729492188,
441
+ "logps/rejected": -125.95369720458984,
442
+ "loss": 0.0001,
443
+ "rewards/accuracies": 0.71875,
444
+ "rewards/chosen": 0.004464815836399794,
445
+ "rewards/margins": 0.004056466277688742,
446
+ "rewards/rejected": 0.0004083492676727474,
447
+ "step": 280
448
+ },
449
+ {
450
+ "epoch": 0.93,
451
+ "learning_rate": 4.738366477638319e-06,
452
+ "logits/chosen": 0.5706946849822998,
453
+ "logits/rejected": 0.7165879011154175,
454
+ "logps/chosen": -172.26846313476562,
455
+ "logps/rejected": -113.09400939941406,
456
+ "loss": 0.0001,
457
+ "rewards/accuracies": 0.637499988079071,
458
+ "rewards/chosen": 0.0033409663010388613,
459
+ "rewards/margins": 0.0031195811461657286,
460
+ "rewards/rejected": 0.0002213849511463195,
461
+ "step": 290
462
+ },
463
+ {
464
+ "epoch": 0.96,
465
+ "learning_rate": 4.7063466543318965e-06,
466
+ "logits/chosen": 0.6467469334602356,
467
+ "logits/rejected": 0.8215667605400085,
468
+ "logps/chosen": -171.28201293945312,
469
+ "logps/rejected": -121.31685638427734,
470
+ "loss": 0.0001,
471
+ "rewards/accuracies": 0.606249988079071,
472
+ "rewards/chosen": 0.00259224371984601,
473
+ "rewards/margins": 0.0022495179437100887,
474
+ "rewards/rejected": 0.0003427262417972088,
475
+ "step": 300
476
+ },
477
+ {
478
+ "epoch": 0.96,
479
+ "eval_logits/chosen": 0.8206124901771545,
480
+ "eval_logits/rejected": 0.8782824277877808,
481
+ "eval_logps/chosen": -256.82720947265625,
482
+ "eval_logps/rejected": -233.77804565429688,
483
+ "eval_loss": 0.00015650840941816568,
484
+ "eval_rewards/accuracies": 0.4984999895095825,
485
+ "eval_rewards/chosen": -0.002130264649167657,
486
+ "eval_rewards/margins": 0.00026750334654934704,
487
+ "eval_rewards/rejected": -0.0023977679666131735,
488
+ "eval_runtime": 412.0828,
489
+ "eval_samples_per_second": 4.853,
490
+ "eval_steps_per_second": 1.213,
491
+ "step": 300
492
+ },
493
+ {
494
+ "epoch": 0.99,
495
+ "learning_rate": 4.672600255192022e-06,
496
+ "logits/chosen": 0.6745213270187378,
497
+ "logits/rejected": 0.7582186460494995,
498
+ "logps/chosen": -177.1463623046875,
499
+ "logps/rejected": -121.45220947265625,
500
+ "loss": 0.0001,
501
+ "rewards/accuracies": 0.59375,
502
+ "rewards/chosen": 0.003095152322202921,
503
+ "rewards/margins": 0.002747387159615755,
504
+ "rewards/rejected": 0.00034776475513353944,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 1.02,
509
+ "learning_rate": 4.6371536884520115e-06,
510
+ "logits/chosen": 0.6856507062911987,
511
+ "logits/rejected": 0.8083009719848633,
512
+ "logps/chosen": -175.43466186523438,
513
+ "logps/rejected": -113.60682678222656,
514
+ "loss": 0.0001,
515
+ "rewards/accuracies": 0.5625,
516
+ "rewards/chosen": 0.0029540827963501215,
517
+ "rewards/margins": 0.0019870868418365717,
518
+ "rewards/rejected": 0.0009669959545135498,
519
+ "step": 320
520
+ },
521
+ {
522
+ "epoch": 1.06,
523
+ "learning_rate": 4.600034692810764e-06,
524
+ "logits/chosen": 0.6629844307899475,
525
+ "logits/rejected": 0.831767737865448,
526
+ "logps/chosen": -181.02310180664062,
527
+ "logps/rejected": -119.25450134277344,
528
+ "loss": 0.0001,
529
+ "rewards/accuracies": 0.643750011920929,
530
+ "rewards/chosen": 0.0041047511622309685,
531
+ "rewards/margins": 0.003629709128290415,
532
+ "rewards/rejected": 0.0004750423540826887,
533
+ "step": 330
534
+ },
535
+ {
536
+ "epoch": 1.09,
537
+ "learning_rate": 4.561272315725852e-06,
538
+ "logits/chosen": 0.6184499263763428,
539
+ "logits/rejected": 0.7539141178131104,
540
+ "logps/chosen": -186.89682006835938,
541
+ "logps/rejected": -139.5793914794922,
542
+ "loss": 0.0001,
543
+ "rewards/accuracies": 0.606249988079071,
544
+ "rewards/chosen": 0.0036196750588715076,
545
+ "rewards/margins": 0.0029566895682364702,
546
+ "rewards/rejected": 0.0006629853160120547,
547
+ "step": 340
548
+ },
549
+ {
550
+ "epoch": 1.12,
551
+ "learning_rate": 4.520896890682449e-06,
552
+ "logits/chosen": 0.6621273159980774,
553
+ "logits/rejected": 0.7836776971817017,
554
+ "logps/chosen": -179.44253540039062,
555
+ "logps/rejected": -126.19744873046875,
556
+ "loss": 0.0001,
557
+ "rewards/accuracies": 0.612500011920929,
558
+ "rewards/chosen": 0.003894126508384943,
559
+ "rewards/margins": 0.003107634838670492,
560
+ "rewards/rejected": 0.0007864916697144508,
561
+ "step": 350
562
+ },
563
+ {
564
+ "epoch": 1.15,
565
+ "learning_rate": 4.478940013455864e-06,
566
+ "logits/chosen": 0.6130011677742004,
567
+ "logits/rejected": 0.7723643183708191,
568
+ "logps/chosen": -179.69866943359375,
569
+ "logps/rejected": -119.17742919921875,
570
+ "loss": 0.0001,
571
+ "rewards/accuracies": 0.675000011920929,
572
+ "rewards/chosen": 0.0030810669995844364,
573
+ "rewards/margins": 0.0031474686693400145,
574
+ "rewards/rejected": -6.640238279942423e-05,
575
+ "step": 360
576
+ },
577
+ {
578
+ "epoch": 1.18,
579
+ "learning_rate": 4.435434517386281e-06,
580
+ "logits/chosen": 0.6930996179580688,
581
+ "logits/rejected": 0.8360698819160461,
582
+ "logps/chosen": -189.090087890625,
583
+ "logps/rejected": -135.1619873046875,
584
+ "loss": 0.0001,
585
+ "rewards/accuracies": 0.612500011920929,
586
+ "rewards/chosen": 0.003456123173236847,
587
+ "rewards/margins": 0.0029000742360949516,
588
+ "rewards/rejected": 0.0005560485878959298,
589
+ "step": 370
590
+ },
591
+ {
592
+ "epoch": 1.22,
593
+ "learning_rate": 4.39041444768504e-06,
594
+ "logits/chosen": 0.6620668172836304,
595
+ "logits/rejected": 0.853411853313446,
596
+ "logps/chosen": -177.2210235595703,
597
+ "logps/rejected": -124.18900299072266,
598
+ "loss": 0.0001,
599
+ "rewards/accuracies": 0.643750011920929,
600
+ "rewards/chosen": 0.00355343846604228,
601
+ "rewards/margins": 0.003507003653794527,
602
+ "rewards/rejected": 4.643518332159147e-05,
603
+ "step": 380
604
+ },
605
+ {
606
+ "epoch": 1.25,
607
+ "learning_rate": 4.343915034792569e-06,
608
+ "logits/chosen": 0.629216730594635,
609
+ "logits/rejected": 0.7686316967010498,
610
+ "logps/chosen": -184.9043426513672,
611
+ "logps/rejected": -130.17727661132812,
612
+ "loss": 0.0001,
613
+ "rewards/accuracies": 0.6499999761581421,
614
+ "rewards/chosen": 0.003557362360879779,
615
+ "rewards/margins": 0.003719041822478175,
616
+ "rewards/rejected": -0.00016167931607924402,
617
+ "step": 390
618
+ },
619
+ {
620
+ "epoch": 1.28,
621
+ "learning_rate": 4.295972666808811e-06,
622
+ "logits/chosen": 0.6529077291488647,
623
+ "logits/rejected": 0.8112252354621887,
624
+ "logps/chosen": -172.85665893554688,
625
+ "logps/rejected": -127.3191146850586,
626
+ "loss": 0.0001,
627
+ "rewards/accuracies": 0.6312500238418579,
628
+ "rewards/chosen": 0.0037923946511000395,
629
+ "rewards/margins": 0.003340603783726692,
630
+ "rewards/rejected": 0.00045179054723121226,
631
+ "step": 400
632
+ },
633
+ {
634
+ "epoch": 1.28,
635
+ "eval_logits/chosen": 0.8192203044891357,
636
+ "eval_logits/rejected": 0.8769342303276062,
637
+ "eval_logps/chosen": -256.87567138671875,
638
+ "eval_logps/rejected": -233.8277130126953,
639
+ "eval_loss": 0.00015775366046000272,
640
+ "eval_rewards/accuracies": 0.5195000171661377,
641
+ "eval_rewards/chosen": -0.002614969853311777,
642
+ "eval_rewards/margins": 0.00027956385747529566,
643
+ "eval_rewards/rejected": -0.002894533798098564,
644
+ "eval_runtime": 412.2856,
645
+ "eval_samples_per_second": 4.851,
646
+ "eval_steps_per_second": 1.213,
647
+ "step": 400
648
+ },
649
+ {
650
+ "epoch": 1.31,
651
+ "learning_rate": 4.246624861017732e-06,
652
+ "logits/chosen": 0.625112771987915,
653
+ "logits/rejected": 0.7611836791038513,
654
+ "logps/chosen": -178.12124633789062,
655
+ "logps/rejected": -135.6422882080078,
656
+ "loss": 0.0001,
657
+ "rewards/accuracies": 0.6499999761581421,
658
+ "rewards/chosen": 0.004022772889584303,
659
+ "rewards/margins": 0.003282977268099785,
660
+ "rewards/rejected": 0.0007397954468615353,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 1.34,
665
+ "learning_rate": 4.195910234528186e-06,
666
+ "logits/chosen": 0.6460477709770203,
667
+ "logits/rejected": 0.7641812562942505,
668
+ "logps/chosen": -168.17930603027344,
669
+ "logps/rejected": -124.3600082397461,
670
+ "loss": 0.0001,
671
+ "rewards/accuracies": 0.5874999761581421,
672
+ "rewards/chosen": 0.0026535852812230587,
673
+ "rewards/margins": 0.0026994948275387287,
674
+ "rewards/rejected": -4.590977187035605e-05,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 1.38,
679
+ "learning_rate": 4.143868474054098e-06,
680
+ "logits/chosen": 0.5987478494644165,
681
+ "logits/rejected": 0.7457458972930908,
682
+ "logps/chosen": -189.61422729492188,
683
+ "logps/rejected": -123.73170471191406,
684
+ "loss": 0.0001,
685
+ "rewards/accuracies": 0.643750011920929,
686
+ "rewards/chosen": 0.003896042238920927,
687
+ "rewards/margins": 0.00355791044421494,
688
+ "rewards/rejected": 0.00033813173649832606,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 1.41,
693
+ "learning_rate": 4.0905403048576545e-06,
694
+ "logits/chosen": 0.5944562554359436,
695
+ "logits/rejected": 0.7794451713562012,
696
+ "logps/chosen": -173.83494567871094,
697
+ "logps/rejected": -125.55583190917969,
698
+ "loss": 0.0001,
699
+ "rewards/accuracies": 0.581250011920929,
700
+ "rewards/chosen": 0.003923391457647085,
701
+ "rewards/margins": 0.003630859311670065,
702
+ "rewards/rejected": 0.00029253208776935935,
703
+ "step": 440
704
+ },
705
+ {
706
+ "epoch": 1.44,
707
+ "learning_rate": 4.035967458879751e-06,
708
+ "logits/chosen": 0.5830662846565247,
709
+ "logits/rejected": 0.7792474031448364,
710
+ "logps/chosen": -174.3615264892578,
711
+ "logps/rejected": -130.67514038085938,
712
+ "loss": 0.0001,
713
+ "rewards/accuracies": 0.643750011920929,
714
+ "rewards/chosen": 0.0029520168900489807,
715
+ "rewards/margins": 0.0029378398321568966,
716
+ "rewards/rejected": 1.4177237972035073e-05,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 1.47,
721
+ "learning_rate": 3.980192642082682e-06,
722
+ "logits/chosen": 0.6810758709907532,
723
+ "logits/rejected": 0.7834008932113647,
724
+ "logps/chosen": -168.80197143554688,
725
+ "logps/rejected": -118.42387390136719,
726
+ "loss": 0.0001,
727
+ "rewards/accuracies": 0.581250011920929,
728
+ "rewards/chosen": 0.0025485253427177668,
729
+ "rewards/margins": 0.002175524365156889,
730
+ "rewards/rejected": 0.00037300080293789506,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 1.5,
735
+ "learning_rate": 3.923259501030604e-06,
736
+ "logits/chosen": 0.6081482768058777,
737
+ "logits/rejected": 0.8008432388305664,
738
+ "logps/chosen": -158.7695770263672,
739
+ "logps/rejected": -117.162109375,
740
+ "loss": 0.0001,
741
+ "rewards/accuracies": 0.5687500238418579,
742
+ "rewards/chosen": 0.003074315609410405,
743
+ "rewards/margins": 0.0024462412111461163,
744
+ "rewards/rejected": 0.0006280745146796107,
745
+ "step": 470
746
+ },
747
+ {
748
+ "epoch": 1.54,
749
+ "learning_rate": 3.865212588733927e-06,
750
+ "logits/chosen": 0.5983234643936157,
751
+ "logits/rejected": 0.8347161412239075,
752
+ "logps/chosen": -177.07232666015625,
753
+ "logps/rejected": -113.17292785644531,
754
+ "loss": 0.0001,
755
+ "rewards/accuracies": 0.5687500238418579,
756
+ "rewards/chosen": 0.004156365990638733,
757
+ "rewards/margins": 0.0032986297737807035,
758
+ "rewards/rejected": 0.0008577358676120639,
759
+ "step": 480
760
+ },
761
+ {
762
+ "epoch": 1.57,
763
+ "learning_rate": 3.8060973297843773e-06,
764
+ "logits/chosen": 0.6949520111083984,
765
+ "logits/rejected": 0.8520463109016418,
766
+ "logps/chosen": -180.8011016845703,
767
+ "logps/rejected": -135.38951110839844,
768
+ "loss": 0.0001,
769
+ "rewards/accuracies": 0.59375,
770
+ "rewards/chosen": 0.0031148726120591164,
771
+ "rewards/margins": 0.0025527984835207462,
772
+ "rewards/rejected": 0.000562074186746031,
773
+ "step": 490
774
+ },
775
+ {
776
+ "epoch": 1.6,
777
+ "learning_rate": 3.7459599848079965e-06,
778
+ "logits/chosen": 0.6497796773910522,
779
+ "logits/rejected": 0.8080886006355286,
780
+ "logps/chosen": -187.10287475585938,
781
+ "logps/rejected": -133.8563995361328,
782
+ "loss": 0.0001,
783
+ "rewards/accuracies": 0.574999988079071,
784
+ "rewards/chosen": 0.004290526732802391,
785
+ "rewards/margins": 0.0023678760044276714,
786
+ "rewards/rejected": 0.0019226508447900414,
787
+ "step": 500
788
+ },
789
+ {
790
+ "epoch": 1.6,
791
+ "eval_logits/chosen": 0.8150748610496521,
792
+ "eval_logits/rejected": 0.8728834986686707,
793
+ "eval_logps/chosen": -256.8869323730469,
794
+ "eval_logps/rejected": -233.8388214111328,
795
+ "eval_loss": 0.0001666269963607192,
796
+ "eval_rewards/accuracies": 0.5170000195503235,
797
+ "eval_rewards/chosen": -0.0027276412583887577,
798
+ "eval_rewards/margins": 0.00027794469497166574,
799
+ "eval_rewards/rejected": -0.0030055860988795757,
800
+ "eval_runtime": 414.2028,
801
+ "eval_samples_per_second": 4.829,
802
+ "eval_steps_per_second": 1.207,
803
+ "step": 500
804
+ },
805
+ {
806
+ "epoch": 1.63,
807
+ "learning_rate": 3.684847614263898e-06,
808
+ "logits/chosen": 0.649914562702179,
809
+ "logits/rejected": 0.8345224261283875,
810
+ "logps/chosen": -170.88528442382812,
811
+ "logps/rejected": -113.5182876586914,
812
+ "loss": 0.0001,
813
+ "rewards/accuracies": 0.6312500238418579,
814
+ "rewards/chosen": 0.004132578149437904,
815
+ "rewards/margins": 0.004135853610932827,
816
+ "rewards/rejected": -3.27564202962094e-06,
817
+ "step": 510
818
+ },
819
+ {
820
+ "epoch": 1.66,
821
+ "learning_rate": 3.622808041617133e-06,
822
+ "logits/chosen": 0.6696175336837769,
823
+ "logits/rejected": 0.8282491564750671,
824
+ "logps/chosen": -165.21356201171875,
825
+ "logps/rejected": -120.80168151855469,
826
+ "loss": 0.0001,
827
+ "rewards/accuracies": 0.6187499761581421,
828
+ "rewards/chosen": 0.003804347710683942,
829
+ "rewards/margins": 0.002934789750725031,
830
+ "rewards/rejected": 0.0008695581927895546,
831
+ "step": 520
832
+ },
833
+ {
834
+ "epoch": 1.7,
835
+ "learning_rate": 3.559889815914441e-06,
836
+ "logits/chosen": 0.6223723888397217,
837
+ "logits/rejected": 0.8060038685798645,
838
+ "logps/chosen": -173.92562866210938,
839
+ "logps/rejected": -115.7896728515625,
840
+ "loss": 0.0001,
841
+ "rewards/accuracies": 0.6499999761581421,
842
+ "rewards/chosen": 0.0046675438061356544,
843
+ "rewards/margins": 0.0034386657644063234,
844
+ "rewards/rejected": 0.001228878041729331,
845
+ "step": 530
846
+ },
847
+ {
848
+ "epoch": 1.73,
849
+ "learning_rate": 3.496142173792219e-06,
850
+ "logits/chosen": 0.5860522389411926,
851
+ "logits/rejected": 0.7538793683052063,
852
+ "logps/chosen": -179.20030212402344,
853
+ "logps/rejected": -126.5418930053711,
854
+ "loss": 0.0001,
855
+ "rewards/accuracies": 0.612500011920929,
856
+ "rewards/chosen": 0.0036121346056461334,
857
+ "rewards/margins": 0.003292589681223035,
858
+ "rewards/rejected": 0.00031954512814991176,
859
+ "step": 540
860
+ },
861
+ {
862
+ "epoch": 1.76,
863
+ "learning_rate": 3.4316150009464023e-06,
864
+ "logits/chosen": 0.6498968601226807,
865
+ "logits/rejected": 0.8035451173782349,
866
+ "logps/chosen": -192.52578735351562,
867
+ "logps/rejected": -139.58251953125,
868
+ "loss": 0.0001,
869
+ "rewards/accuracies": 0.606249988079071,
870
+ "rewards/chosen": 0.003925256431102753,
871
+ "rewards/margins": 0.0028843428008258343,
872
+ "rewards/rejected": 0.001040913863107562,
873
+ "step": 550
874
+ },
875
+ {
876
+ "epoch": 1.79,
877
+ "learning_rate": 3.366358793094433e-06,
878
+ "logits/chosen": 0.5884669423103333,
879
+ "logits/rejected": 0.8008158802986145,
880
+ "logps/chosen": -179.25753784179688,
881
+ "logps/rejected": -120.347412109375,
882
+ "loss": 0.0001,
883
+ "rewards/accuracies": 0.59375,
884
+ "rewards/chosen": 0.003893566783517599,
885
+ "rewards/margins": 0.0027398644015192986,
886
+ "rewards/rejected": 0.0011537026148289442,
887
+ "step": 560
888
+ },
889
+ {
890
+ "epoch": 1.82,
891
+ "learning_rate": 3.3004246164598535e-06,
892
+ "logits/chosen": 0.6389291286468506,
893
+ "logits/rejected": 0.8485361933708191,
894
+ "logps/chosen": -190.9259490966797,
895
+ "logps/rejected": -140.17202758789062,
896
+ "loss": 0.0001,
897
+ "rewards/accuracies": 0.675000011920929,
898
+ "rewards/chosen": 0.004239690490067005,
899
+ "rewards/margins": 0.003860587952658534,
900
+ "rewards/rejected": 0.0003791024792008102,
901
+ "step": 570
902
+ },
903
+ {
904
+ "epoch": 1.86,
905
+ "learning_rate": 3.233864067810446e-06,
906
+ "logits/chosen": 0.6482547521591187,
907
+ "logits/rejected": 0.8119586110115051,
908
+ "logps/chosen": -183.17164611816406,
909
+ "logps/rejected": -119.36767578125,
910
+ "loss": 0.0001,
911
+ "rewards/accuracies": 0.6499999761581421,
912
+ "rewards/chosen": 0.004655472934246063,
913
+ "rewards/margins": 0.0037689208984375,
914
+ "rewards/rejected": 0.0008865518611855805,
915
+ "step": 580
916
+ },
917
+ {
918
+ "epoch": 1.89,
919
+ "learning_rate": 3.1667292340812077e-06,
920
+ "logits/chosen": 0.6013578176498413,
921
+ "logits/rejected": 0.7613108158111572,
922
+ "logps/chosen": -181.44009399414062,
923
+ "logps/rejected": -140.5045166015625,
924
+ "loss": 0.0001,
925
+ "rewards/accuracies": 0.625,
926
+ "rewards/chosen": 0.004445691592991352,
927
+ "rewards/margins": 0.003334993962198496,
928
+ "rewards/rejected": 0.0011106978636234999,
929
+ "step": 590
930
+ },
931
+ {
932
+ "epoch": 1.92,
933
+ "learning_rate": 3.099072651613728e-06,
934
+ "logits/chosen": 0.7029744386672974,
935
+ "logits/rejected": 0.8172439336776733,
936
+ "logps/chosen": -165.12625122070312,
937
+ "logps/rejected": -119.59954833984375,
938
+ "loss": 0.0001,
939
+ "rewards/accuracies": 0.612500011920929,
940
+ "rewards/chosen": 0.0032034076284617186,
941
+ "rewards/margins": 0.0023894021287560463,
942
+ "rewards/rejected": 0.0008140054414980114,
943
+ "step": 600
944
+ },
945
+ {
946
+ "epoch": 1.92,
947
+ "eval_logits/chosen": 0.817969799041748,
948
+ "eval_logits/rejected": 0.8756601810455322,
949
+ "eval_logps/chosen": -256.885986328125,
950
+ "eval_logps/rejected": -233.84141540527344,
951
+ "eval_loss": 0.00016333417443092912,
952
+ "eval_rewards/accuracies": 0.5070000290870667,
953
+ "eval_rewards/chosen": -0.0027182498015463352,
954
+ "eval_rewards/margins": 0.0003131589328404516,
955
+ "eval_rewards/rejected": -0.003031408879905939,
956
+ "eval_runtime": 412.1696,
957
+ "eval_samples_per_second": 4.852,
958
+ "eval_steps_per_second": 1.213,
959
+ "step": 600
960
+ },
961
+ {
962
+ "epoch": 1.95,
963
+ "learning_rate": 3.0309472650438982e-06,
964
+ "logits/chosen": 0.7164211869239807,
965
+ "logits/rejected": 0.8135250806808472,
966
+ "logps/chosen": -167.83938598632812,
967
+ "logps/rejected": -123.3685531616211,
968
+ "loss": 0.0001,
969
+ "rewards/accuracies": 0.606249988079071,
970
+ "rewards/chosen": 0.004080263432115316,
971
+ "rewards/margins": 0.002866895869374275,
972
+ "rewards/rejected": 0.0012133677955716848,
973
+ "step": 610
974
+ },
975
+ {
976
+ "epoch": 1.98,
977
+ "learning_rate": 2.9624063858701006e-06,
978
+ "logits/chosen": 0.6123205423355103,
979
+ "logits/rejected": 0.8067408800125122,
980
+ "logps/chosen": -174.879638671875,
981
+ "logps/rejected": -119.88896179199219,
982
+ "loss": 0.0001,
983
+ "rewards/accuracies": 0.6000000238418579,
984
+ "rewards/chosen": 0.0032961233519017696,
985
+ "rewards/margins": 0.0031299355905503035,
986
+ "rewards/rejected": 0.00016618790687061846,
987
+ "step": 620
988
+ },
989
+ {
990
+ "epoch": 2.02,
991
+ "learning_rate": 2.8935036507343185e-06,
992
+ "logits/chosen": 0.6171947717666626,
993
+ "logits/rejected": 0.7731830477714539,
994
+ "logps/chosen": -173.52308654785156,
995
+ "logps/rejected": -122.30467224121094,
996
+ "loss": 0.0001,
997
+ "rewards/accuracies": 0.6187499761581421,
998
+ "rewards/chosen": 0.004019217099994421,
999
+ "rewards/margins": 0.003546707332134247,
1000
+ "rewards/rejected": 0.0004725100880023092,
1001
+ "step": 630
1002
+ },
1003
+ {
1004
+ "epoch": 2.05,
1005
+ "learning_rate": 2.8242929794487926e-06,
1006
+ "logits/chosen": 0.6720027923583984,
1007
+ "logits/rejected": 0.8209166526794434,
1008
+ "logps/chosen": -167.04946899414062,
1009
+ "logps/rejected": -120.01603698730469,
1010
+ "loss": 0.0001,
1011
+ "rewards/accuracies": 0.6625000238418579,
1012
+ "rewards/chosen": 0.004417883697897196,
1013
+ "rewards/margins": 0.00379347731359303,
1014
+ "rewards/rejected": 0.000624406267888844,
1015
+ "step": 640
1016
+ },
1017
+ {
1018
+ "epoch": 2.08,
1019
+ "learning_rate": 2.7548285328010984e-06,
1020
+ "logits/chosen": 0.5649908781051636,
1021
+ "logits/rejected": 0.7522573471069336,
1022
+ "logps/chosen": -167.37994384765625,
1023
+ "logps/rejected": -118.6939697265625,
1024
+ "loss": 0.0001,
1025
+ "rewards/accuracies": 0.581250011920929,
1026
+ "rewards/chosen": 0.003759379032999277,
1027
+ "rewards/margins": 0.0030631672125309706,
1028
+ "rewards/rejected": 0.0006962117040529847,
1029
+ "step": 650
1030
+ },
1031
+ {
1032
+ "epoch": 2.11,
1033
+ "learning_rate": 2.6851646701706306e-06,
1034
+ "logits/chosen": 0.7023177742958069,
1035
+ "logits/rejected": 0.8027107119560242,
1036
+ "logps/chosen": -177.06430053710938,
1037
+ "logps/rejected": -129.97714233398438,
1038
+ "loss": 0.0001,
1039
+ "rewards/accuracies": 0.6000000238418579,
1040
+ "rewards/chosen": 0.003546078223735094,
1041
+ "rewards/margins": 0.0027953279204666615,
1042
+ "rewards/rejected": 0.0007507502450607717,
1043
+ "step": 660
1044
+ },
1045
+ {
1046
+ "epoch": 2.14,
1047
+ "learning_rate": 2.6153559069897007e-06,
1048
+ "logits/chosen": 0.6293431520462036,
1049
+ "logits/rejected": 0.7750530242919922,
1050
+ "logps/chosen": -168.621337890625,
1051
+ "logps/rejected": -122.5462875366211,
1052
+ "loss": 0.0001,
1053
+ "rewards/accuracies": 0.606249988079071,
1054
+ "rewards/chosen": 0.003033037530258298,
1055
+ "rewards/margins": 0.0028599021025002003,
1056
+ "rewards/rejected": 0.00017313548596575856,
1057
+ "step": 670
1058
+ },
1059
+ {
1060
+ "epoch": 2.18,
1061
+ "learning_rate": 2.5454568720824937e-06,
1062
+ "logits/chosen": 0.5758141279220581,
1063
+ "logits/rejected": 0.7897475957870483,
1064
+ "logps/chosen": -174.54208374023438,
1065
+ "logps/rejected": -118.59149169921875,
1066
+ "loss": 0.0001,
1067
+ "rewards/accuracies": 0.6875,
1068
+ "rewards/chosen": 0.003853294998407364,
1069
+ "rewards/margins": 0.003840038087219,
1070
+ "rewards/rejected": 1.3256654710858129e-05,
1071
+ "step": 680
1072
+ },
1073
+ {
1074
+ "epoch": 2.21,
1075
+ "learning_rate": 2.4755222649153014e-06,
1076
+ "logits/chosen": 0.6393508911132812,
1077
+ "logits/rejected": 0.8246806263923645,
1078
+ "logps/chosen": -184.72274780273438,
1079
+ "logps/rejected": -126.2255859375,
1080
+ "loss": 0.0001,
1081
+ "rewards/accuracies": 0.6312500238418579,
1082
+ "rewards/chosen": 0.0034170313738286495,
1083
+ "rewards/margins": 0.0032350593246519566,
1084
+ "rewards/rejected": 0.00018197241297457367,
1085
+ "step": 690
1086
+ },
1087
+ {
1088
+ "epoch": 2.24,
1089
+ "learning_rate": 2.4056068127914803e-06,
1090
+ "logits/chosen": 0.7041198015213013,
1091
+ "logits/rejected": 0.8579221963882446,
1092
+ "logps/chosen": -165.37600708007812,
1093
+ "logps/rejected": -113.6267318725586,
1094
+ "loss": 0.0001,
1095
+ "rewards/accuracies": 0.6312500238418579,
1096
+ "rewards/chosen": 0.0038474693428725004,
1097
+ "rewards/margins": 0.004361593164503574,
1098
+ "rewards/rejected": -0.0005141238798387349,
1099
+ "step": 700
1100
+ },
1101
+ {
1102
+ "epoch": 2.24,
1103
+ "eval_logits/chosen": 0.8142222166061401,
1104
+ "eval_logits/rejected": 0.8718712329864502,
1105
+ "eval_logps/chosen": -256.9122619628906,
1106
+ "eval_logps/rejected": -233.85919189453125,
1107
+ "eval_loss": 0.00016846887592691928,
1108
+ "eval_rewards/accuracies": 0.5065000057220459,
1109
+ "eval_rewards/chosen": -0.0029809277039021254,
1110
+ "eval_rewards/margins": 0.00022821790480520576,
1111
+ "eval_rewards/rejected": -0.003209145972505212,
1112
+ "eval_runtime": 412.3465,
1113
+ "eval_samples_per_second": 4.85,
1114
+ "eval_steps_per_second": 1.213,
1115
+ "step": 700
1116
+ },
1117
+ {
1118
+ "epoch": 2.27,
1119
+ "learning_rate": 2.3357652280246125e-06,
1120
+ "logits/chosen": 0.6320663690567017,
1121
+ "logits/rejected": 0.785031259059906,
1122
+ "logps/chosen": -189.28390502929688,
1123
+ "logps/rejected": -144.56167602539062,
1124
+ "loss": 0.0001,
1125
+ "rewards/accuracies": 0.675000011920929,
1126
+ "rewards/chosen": 0.0025972402654588223,
1127
+ "rewards/margins": 0.0028369042556732893,
1128
+ "rewards/rejected": -0.00023966425214894116,
1129
+ "step": 710
1130
+ },
1131
+ {
1132
+ "epoch": 2.3,
1133
+ "learning_rate": 2.2660521651234036e-06,
1134
+ "logits/chosen": 0.6551303267478943,
1135
+ "logits/rejected": 0.8406831622123718,
1136
+ "logps/chosen": -186.629638671875,
1137
+ "logps/rejected": -126.31327819824219,
1138
+ "loss": 0.0001,
1139
+ "rewards/accuracies": 0.606249988079071,
1140
+ "rewards/chosen": 0.0038953598123043776,
1141
+ "rewards/margins": 0.003374651074409485,
1142
+ "rewards/rejected": 0.0005207090289331973,
1143
+ "step": 720
1144
+ },
1145
+ {
1146
+ "epoch": 2.34,
1147
+ "learning_rate": 2.1965221780218173e-06,
1148
+ "logits/chosen": 0.6312896013259888,
1149
+ "logits/rejected": 0.7552987337112427,
1150
+ "logps/chosen": -173.728271484375,
1151
+ "logps/rejected": -124.27177429199219,
1152
+ "loss": 0.0001,
1153
+ "rewards/accuracies": 0.625,
1154
+ "rewards/chosen": 0.004026113077998161,
1155
+ "rewards/margins": 0.0039476132951676846,
1156
+ "rewards/rejected": 7.849968096707016e-05,
1157
+ "step": 730
1158
+ },
1159
+ {
1160
+ "epoch": 2.37,
1161
+ "learning_rate": 2.1272296773879107e-06,
1162
+ "logits/chosen": 0.6133307814598083,
1163
+ "logits/rejected": 0.8152937889099121,
1164
+ "logps/chosen": -178.77232360839844,
1165
+ "logps/rejected": -125.87672424316406,
1166
+ "loss": 0.0001,
1167
+ "rewards/accuracies": 0.6312500238418579,
1168
+ "rewards/chosen": 0.0026045762933790684,
1169
+ "rewards/margins": 0.0032864962704479694,
1170
+ "rewards/rejected": -0.0006819200934842229,
1171
+ "step": 740
1172
+ },
1173
+ {
1174
+ "epoch": 2.4,
1175
+ "learning_rate": 2.058228888044788e-06,
1176
+ "logits/chosen": 0.6092817187309265,
1177
+ "logits/rejected": 0.7013009190559387,
1178
+ "logps/chosen": -165.13404846191406,
1179
+ "logps/rejected": -122.30549621582031,
1180
+ "loss": 0.0001,
1181
+ "rewards/accuracies": 0.6187499761581421,
1182
+ "rewards/chosen": 0.003001943463459611,
1183
+ "rewards/margins": 0.0025921487249433994,
1184
+ "rewards/rejected": 0.0004097948840353638,
1185
+ "step": 750
1186
+ },
1187
+ {
1188
+ "epoch": 2.43,
1189
+ "learning_rate": 1.989573806536978e-06,
1190
+ "logits/chosen": 0.6568277478218079,
1191
+ "logits/rejected": 0.8574051856994629,
1192
+ "logps/chosen": -179.80409240722656,
1193
+ "logps/rejected": -120.96827697753906,
1194
+ "loss": 0.0001,
1195
+ "rewards/accuracies": 0.65625,
1196
+ "rewards/chosen": 0.004101686645299196,
1197
+ "rewards/margins": 0.00385199673473835,
1198
+ "rewards/rejected": 0.0002496893866918981,
1199
+ "step": 760
1200
+ },
1201
+ {
1202
+ "epoch": 2.46,
1203
+ "learning_rate": 1.921318158875459e-06,
1204
+ "logits/chosen": 0.6586846113204956,
1205
+ "logits/rejected": 0.8662702441215515,
1206
+ "logps/chosen": -174.4876708984375,
1207
+ "logps/rejected": -128.13922119140625,
1208
+ "loss": 0.0001,
1209
+ "rewards/accuracies": 0.6000000238418579,
1210
+ "rewards/chosen": 0.004603301174938679,
1211
+ "rewards/margins": 0.0032628674525767565,
1212
+ "rewards/rejected": 0.0013404333731159568,
1213
+ "step": 770
1214
+ },
1215
+ {
1216
+ "epoch": 2.5,
1217
+ "learning_rate": 1.8535153584943915e-06,
1218
+ "logits/chosen": 0.6371630430221558,
1219
+ "logits/rejected": 0.7737770080566406,
1220
+ "logps/chosen": -170.006591796875,
1221
+ "logps/rejected": -126.65934753417969,
1222
+ "loss": 0.0001,
1223
+ "rewards/accuracies": 0.59375,
1224
+ "rewards/chosen": 0.004735985770821571,
1225
+ "rewards/margins": 0.003097447333857417,
1226
+ "rewards/rejected": 0.001638538553379476,
1227
+ "step": 780
1228
+ },
1229
+ {
1230
+ "epoch": 2.53,
1231
+ "learning_rate": 1.7862184644524422e-06,
1232
+ "logits/chosen": 0.6403388977050781,
1233
+ "logits/rejected": 0.7904574275016785,
1234
+ "logps/chosen": -178.15911865234375,
1235
+ "logps/rejected": -133.64309692382812,
1236
+ "loss": 0.0001,
1237
+ "rewards/accuracies": 0.675000011920929,
1238
+ "rewards/chosen": 0.00445596594363451,
1239
+ "rewards/margins": 0.004045891109853983,
1240
+ "rewards/rejected": 0.0004100751248188317,
1241
+ "step": 790
1242
+ },
1243
+ {
1244
+ "epoch": 2.56,
1245
+ "learning_rate": 1.7194801399114471e-06,
1246
+ "logits/chosen": 0.6325648427009583,
1247
+ "logits/rejected": 0.747418999671936,
1248
+ "logps/chosen": -174.4492950439453,
1249
+ "logps/rejected": -125.17801666259766,
1250
+ "loss": 0.0001,
1251
+ "rewards/accuracies": 0.6812499761581421,
1252
+ "rewards/chosen": 0.004687703680247068,
1253
+ "rewards/margins": 0.004329483024775982,
1254
+ "rewards/rejected": 0.00035822103382088244,
1255
+ "step": 800
1256
+ },
1257
+ {
1258
+ "epoch": 2.56,
1259
+ "eval_logits/chosen": 0.8135467767715454,
1260
+ "eval_logits/rejected": 0.8712592124938965,
1261
+ "eval_logps/chosen": -256.8898010253906,
1262
+ "eval_logps/rejected": -233.84222412109375,
1263
+ "eval_loss": 0.00016861619951669127,
1264
+ "eval_rewards/accuracies": 0.5189999938011169,
1265
+ "eval_rewards/chosen": -0.002756227506324649,
1266
+ "eval_rewards/margins": 0.00028340137214399874,
1267
+ "eval_rewards/rejected": -0.003039628965780139,
1268
+ "eval_runtime": 412.1883,
1269
+ "eval_samples_per_second": 4.852,
1270
+ "eval_steps_per_second": 1.213,
1271
+ "step": 800
1272
+ },
1273
+ {
1274
+ "epoch": 2.59,
1275
+ "learning_rate": 1.6533526109248632e-06,
1276
+ "logits/chosen": 0.6175631284713745,
1277
+ "logits/rejected": 0.74301677942276,
1278
+ "logps/chosen": -179.27880859375,
1279
+ "logps/rejected": -123.3892822265625,
1280
+ "loss": 0.0001,
1281
+ "rewards/accuracies": 0.643750011920929,
1282
+ "rewards/chosen": 0.003877174574881792,
1283
+ "rewards/margins": 0.003652264829725027,
1284
+ "rewards/rejected": 0.00022490958508569747,
1285
+ "step": 810
1286
+ },
1287
+ {
1288
+ "epoch": 2.62,
1289
+ "learning_rate": 1.5878876255682951e-06,
1290
+ "logits/chosen": 0.6676111817359924,
1291
+ "logits/rejected": 0.7865282893180847,
1292
+ "logps/chosen": -189.4076385498047,
1293
+ "logps/rejected": -137.06271362304688,
1294
+ "loss": 0.0001,
1295
+ "rewards/accuracies": 0.6625000238418579,
1296
+ "rewards/chosen": 0.004411728121340275,
1297
+ "rewards/margins": 0.0037712506018579006,
1298
+ "rewards/rejected": 0.0006404774612747133,
1299
+ "step": 820
1300
+ },
1301
+ {
1302
+ "epoch": 2.66,
1303
+ "learning_rate": 1.5231364134440485e-06,
1304
+ "logits/chosen": 0.6061269044876099,
1305
+ "logits/rejected": 0.7513775825500488,
1306
+ "logps/chosen": -190.88760375976562,
1307
+ "logps/rejected": -134.599365234375,
1308
+ "loss": 0.0001,
1309
+ "rewards/accuracies": 0.6499999761581421,
1310
+ "rewards/chosen": 0.004417582880705595,
1311
+ "rewards/margins": 0.0038115177303552628,
1312
+ "rewards/rejected": 0.000606064626481384,
1313
+ "step": 830
1314
+ },
1315
+ {
1316
+ "epoch": 2.69,
1317
+ "learning_rate": 1.4591496455914292e-06,
1318
+ "logits/chosen": 0.6278064846992493,
1319
+ "logits/rejected": 0.7927815914154053,
1320
+ "logps/chosen": -173.77554321289062,
1321
+ "logps/rejected": -128.66485595703125,
1322
+ "loss": 0.0001,
1323
+ "rewards/accuracies": 0.637499988079071,
1324
+ "rewards/chosen": 0.003960388712584972,
1325
+ "rewards/margins": 0.0037871100939810276,
1326
+ "rewards/rejected": 0.0001732785312924534,
1327
+ "step": 840
1328
+ },
1329
+ {
1330
+ "epoch": 2.72,
+ "learning_rate": 1.395977394834132e-06,
+ "logits/chosen": 0.6175497174263,
+ "logits/rejected": 0.8719260096549988,
+ "logps/chosen": -172.75567626953125,
+ "logps/rejected": -118.21751403808594,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.65625,
+ "rewards/chosen": 0.004103314597159624,
+ "rewards/margins": 0.003662864211946726,
+ "rewards/rejected": 0.0004404502979014069,
+ "step": 850
+ },
+ {
+ "epoch": 2.75,
+ "learning_rate": 1.3336690965957733e-06,
+ "logits/chosen": 0.6757524013519287,
+ "logits/rejected": 0.8124428987503052,
+ "logps/chosen": -192.186279296875,
+ "logps/rejected": -144.07810974121094,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.668749988079071,
+ "rewards/chosen": 0.004593437071889639,
+ "rewards/margins": 0.004751748405396938,
+ "rewards/rejected": -0.0001583110133651644,
+ "step": 860
+ },
+ {
+ "epoch": 2.78,
+ "learning_rate": 1.2722735102142192e-06,
+ "logits/chosen": 0.5904192328453064,
+ "logits/rejected": 0.8412498235702515,
+ "logps/chosen": -178.94638061523438,
+ "logps/rejected": -116.67149353027344,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.706250011920929,
+ "rewards/chosen": 0.0042373063042759895,
+ "rewards/margins": 0.004430609289556742,
+ "rewards/rejected": -0.00019330321811139584,
+ "step": 870
+ },
+ {
+ "epoch": 2.82,
+ "learning_rate": 1.2118386807849733e-06,
+ "logits/chosen": 0.6753177642822266,
+ "logits/rejected": 0.8218367695808411,
+ "logps/chosen": -169.46713256835938,
+ "logps/rejected": -124.23944091796875,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.6499999761581421,
+ "rewards/chosen": 0.0036155576817691326,
+ "rewards/margins": 0.0030525142792612314,
+ "rewards/rejected": 0.0005630434607155621,
+ "step": 880
+ },
+ {
+ "epoch": 2.85,
+ "learning_rate": 1.1524119015635116e-06,
+ "logits/chosen": 0.5247241258621216,
+ "logits/rejected": 0.7601326704025269,
+ "logps/chosen": -187.26348876953125,
+ "logps/rejected": -123.6650390625,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.6499999761581421,
+ "rewards/chosen": 0.004491317551583052,
+ "rewards/margins": 0.003613727632910013,
+ "rewards/rejected": 0.0008775900350883603,
+ "step": 890
+ },
+ {
+ "epoch": 2.88,
+ "learning_rate": 1.0940396769559584e-06,
+ "logits/chosen": 0.6696980595588684,
+ "logits/rejected": 0.7714194059371948,
+ "logps/chosen": -191.7372283935547,
+ "logps/rejected": -128.82192993164062,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.6499999761581421,
+ "rewards/chosen": 0.004666435532271862,
+ "rewards/margins": 0.0038504847325384617,
+ "rewards/rejected": 0.0008159511489793658,
+ "step": 900
+ },
+ {
+ "epoch": 2.88,
+ "eval_logits/chosen": 0.8135749101638794,
+ "eval_logits/rejected": 0.8714309930801392,
+ "eval_logps/chosen": -256.91107177734375,
+ "eval_logps/rejected": -233.8529052734375,
+ "eval_loss": 0.00016910216072574258,
+ "eval_rewards/accuracies": 0.5015000104904175,
+ "eval_rewards/chosen": -0.0029687141068279743,
+ "eval_rewards/margins": 0.00017772088176570833,
+ "eval_rewards/rejected": -0.0031464346684515476,
+ "eval_runtime": 412.7326,
+ "eval_samples_per_second": 4.846,
+ "eval_steps_per_second": 1.211,
+ "step": 900
+ },
+ {
+ "epoch": 2.91,
+ "learning_rate": 1.036767686127079e-06,
+ "logits/chosen": 0.6320682168006897,
+ "logits/rejected": 0.7783384323120117,
+ "logps/chosen": -189.1212158203125,
+ "logps/rejected": -130.10711669921875,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.643750011920929,
+ "rewards/chosen": 0.004713307600468397,
+ "rewards/margins": 0.003414090257138014,
+ "rewards/rejected": 0.0012992171104997396,
+ "step": 910
+ },
+ {
+ "epoch": 2.94,
+ "learning_rate": 9.806407472540644e-07,
+ "logits/chosen": 0.6682049036026001,
+ "logits/rejected": 0.8254071474075317,
+ "logps/chosen": -184.09628295898438,
+ "logps/rejected": -123.4487075805664,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.643750011920929,
+ "rewards/chosen": 0.004300036001950502,
+ "rewards/margins": 0.003941989503800869,
+ "rewards/rejected": 0.00035804632352665067,
+ "step": 920
+ },
+ {
+ "epoch": 2.98,
+ "learning_rate": 9.257027824540823e-07,
+ "logits/chosen": 0.6992205381393433,
+ "logits/rejected": 0.8208533525466919,
+ "logps/chosen": -164.10159301757812,
+ "logps/rejected": -110.07139587402344,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.6937500238418579,
+ "rewards/chosen": 0.003934869542717934,
+ "rewards/margins": 0.003884183941408992,
+ "rewards/rejected": 5.068551399745047e-05,
+ "step": 930
+ },
+ {
+ "epoch": 3.01,
+ "learning_rate": 8.719967834130385e-07,
+ "logits/chosen": 0.6116827726364136,
+ "logits/rejected": 0.8160429000854492,
+ "logps/chosen": -169.47280883789062,
+ "logps/rejected": -115.8711166381836,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.7437499761581421,
+ "rewards/chosen": 0.005224294029176235,
+ "rewards/margins": 0.004378092475235462,
+ "rewards/rejected": 0.0008462019031867385,
+ "step": 940
+ },
+ {
+ "epoch": 3.04,
+ "learning_rate": 8.195647777424479e-07,
+ "logits/chosen": 0.5635887384414673,
+ "logits/rejected": 0.7940059900283813,
+ "logps/chosen": -177.22665405273438,
+ "logps/rejected": -120.8065414428711,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.6499999761581421,
+ "rewards/chosen": 0.003887483151629567,
+ "rewards/margins": 0.0035337016452103853,
+ "rewards/rejected": 0.00035378162283450365,
+ "step": 950
+ },
+ {
+ "epoch": 3.07,
+ "learning_rate": 7.684477960907422e-07,
+ "logits/chosen": 0.5837413668632507,
+ "logits/rejected": 0.7604612112045288,
+ "logps/chosen": -175.0729217529297,
+ "logps/rejected": -118.60954284667969,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.625,
+ "rewards/chosen": 0.004512741696089506,
+ "rewards/margins": 0.0036391555331647396,
+ "rewards/rejected": 0.0008735861629247665,
+ "step": 960
+ },
+ {
+ "epoch": 3.1,
+ "learning_rate": 7.186858400347455e-07,
+ "logits/chosen": 0.6265005469322205,
+ "logits/rejected": 0.7885332703590393,
+ "logps/chosen": -182.97769165039062,
+ "logps/rejected": -138.80723571777344,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.5874999761581421,
+ "rewards/chosen": 0.004079051315784454,
+ "rewards/margins": 0.004020996857434511,
+ "rewards/rejected": 5.8054854889633134e-05,
+ "step": 970
+ },
+ {
+ "epoch": 3.14,
+ "learning_rate": 6.703178507764618e-07,
+ "logits/chosen": 0.6634309887886047,
+ "logits/rejected": 0.7914069890975952,
+ "logps/chosen": -167.03756713867188,
+ "logps/rejected": -124.2927474975586,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.6499999761581421,
+ "rewards/chosen": 0.005084961652755737,
+ "rewards/margins": 0.0035723126493394375,
+ "rewards/rejected": 0.0015126490034162998,
+ "step": 980
+ },
+ {
+ "epoch": 3.17,
+ "learning_rate": 6.233816786696414e-07,
+ "logits/chosen": 0.6042094230651855,
+ "logits/rejected": 0.7901323437690735,
+ "logps/chosen": -195.58741760253906,
+ "logps/rejected": -132.481689453125,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.731249988079071,
+ "rewards/chosen": 0.005596310831606388,
+ "rewards/margins": 0.0045918067917227745,
+ "rewards/rejected": 0.001004504389129579,
+ "step": 990
+ },
+ {
+ "epoch": 3.2,
+ "learning_rate": 5.77914053600005e-07,
+ "logits/chosen": 0.5928013920783997,
+ "logits/rejected": 0.8330589532852173,
+ "logps/chosen": -176.38113403320312,
+ "logps/rejected": -123.68299865722656,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.6625000238418579,
+ "rewards/chosen": 0.0048389798030257225,
+ "rewards/margins": 0.00368002662435174,
+ "rewards/rejected": 0.001158953527919948,
+ "step": 1000
+ },
+ {
+ "epoch": 3.2,
+ "eval_logits/chosen": 0.8156088590621948,
+ "eval_logits/rejected": 0.8733118176460266,
+ "eval_logps/chosen": -256.9035949707031,
+ "eval_logps/rejected": -233.8666229248047,
+ "eval_loss": 0.00016867661906871945,
+ "eval_rewards/accuracies": 0.5180000066757202,
+ "eval_rewards/chosen": -0.002894038800150156,
+ "eval_rewards/margins": 0.0003897584683727473,
+ "eval_rewards/rejected": -0.0032837972976267338,
+ "eval_runtime": 412.3586,
+ "eval_samples_per_second": 4.85,
+ "eval_steps_per_second": 1.213,
+ "step": 1000
+ },
+ {
+ "epoch": 3.23,
+ "learning_rate": 5.339505562422851e-07,
+ "logits/chosen": 0.6899782419204712,
+ "logits/rejected": 0.8187691569328308,
+ "logps/chosen": -177.58436584472656,
+ "logps/rejected": -121.7637939453125,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.6187499761581421,
+ "rewards/chosen": 0.0037464499473571777,
+ "rewards/margins": 0.0031559974886476994,
+ "rewards/rejected": 0.0005904529243707657,
+ "step": 1010
+ },
+ {
+ "epoch": 3.26,
+ "learning_rate": 4.915255902165734e-07,
+ "logits/chosen": 0.6754826903343201,
+ "logits/rejected": 0.8096501231193542,
+ "logps/chosen": -173.49542236328125,
+ "logps/rejected": -120.12110900878906,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.643750011920929,
+ "rewards/chosen": 0.004369380883872509,
+ "rewards/margins": 0.0036476694513112307,
+ "rewards/rejected": 0.000721711665391922,
+ "step": 1020
+ },
+ {
+ "epoch": 3.3,
+ "learning_rate": 4.506723551657879e-07,
+ "logits/chosen": 0.5708236694335938,
+ "logits/rejected": 0.7320288419723511,
+ "logps/chosen": -179.21517944335938,
+ "logps/rejected": -131.3833465576172,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.65625,
+ "rewards/chosen": 0.004431615583598614,
+ "rewards/margins": 0.003471215022727847,
+ "rewards/rejected": 0.0009604001534171402,
+ "step": 1030
+ },
+ {
+ "epoch": 3.33,
+ "learning_rate": 4.11422820775299e-07,
+ "logits/chosen": 0.6452796459197998,
+ "logits/rejected": 0.8163026571273804,
+ "logps/chosen": -205.68319702148438,
+ "logps/rejected": -129.33828735351562,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.637499988079071,
+ "rewards/chosen": 0.004141085781157017,
+ "rewards/margins": 0.0046521080657839775,
+ "rewards/rejected": -0.0005110226338729262,
+ "step": 1040
+ },
+ {
+ "epoch": 3.36,
+ "learning_rate": 3.7380770175506397e-07,
+ "logits/chosen": 0.5789592862129211,
+ "logits/rejected": 0.7904280424118042,
+ "logps/chosen": -180.31417846679688,
+ "logps/rejected": -129.52687072753906,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.675000011920929,
+ "rewards/chosen": 0.004802372306585312,
+ "rewards/margins": 0.004047960974276066,
+ "rewards/rejected": 0.0007544115069322288,
+ "step": 1050
+ },
+ {
+ "epoch": 3.39,
+ "learning_rate": 3.3785643380384063e-07,
+ "logits/chosen": 0.6884064674377441,
+ "logits/rejected": 0.8517038226127625,
+ "logps/chosen": -177.23248291015625,
+ "logps/rejected": -117.1332778930664,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.606249988079071,
+ "rewards/chosen": 0.0034295388031750917,
+ "rewards/margins": 0.0038840598426759243,
+ "rewards/rejected": -0.0004545215633697808,
+ "step": 1060
+ },
+ {
+ "epoch": 3.42,
+ "learning_rate": 3.0359715057429186e-07,
+ "logits/chosen": 0.5767031908035278,
+ "logits/rejected": 0.6466313004493713,
+ "logps/chosen": -172.95346069335938,
+ "logps/rejected": -127.76350402832031,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.637499988079071,
+ "rewards/chosen": 0.0043509481474757195,
+ "rewards/margins": 0.0033387758303433657,
+ "rewards/rejected": 0.0010121725499629974,
+ "step": 1070
+ },
+ {
+ "epoch": 3.46,
+ "learning_rate": 2.710566616570048e-07,
+ "logits/chosen": 0.661910891532898,
+ "logits/rejected": 0.8002559542655945,
+ "logps/chosen": -177.2826690673828,
+ "logps/rejected": -131.19056701660156,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.59375,
+ "rewards/chosen": 0.0037956517189741135,
+ "rewards/margins": 0.003371240571141243,
+ "rewards/rejected": 0.0004244106821715832,
+ "step": 1080
+ },
+ {
+ "epoch": 3.49,
+ "learning_rate": 2.40260431600654e-07,
+ "logits/chosen": 0.6113812923431396,
+ "logits/rejected": 0.8205118179321289,
+ "logps/chosen": -174.49224853515625,
+ "logps/rejected": -135.5587158203125,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.574999988079071,
+ "rewards/chosen": 0.004168152809143066,
+ "rewards/margins": 0.0029846313409507275,
+ "rewards/rejected": 0.0011835219338536263,
+ "step": 1090
+ },
+ {
+ "epoch": 3.52,
+ "learning_rate": 2.1123255998472952e-07,
+ "logits/chosen": 0.6319399476051331,
+ "logits/rejected": 0.7784051895141602,
+ "logps/chosen": -160.6739501953125,
+ "logps/rejected": -114.9722671508789,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.668749988079071,
+ "rewards/chosen": 0.004197821486741304,
+ "rewards/margins": 0.0031375852413475513,
+ "rewards/rejected": 0.0010602364782243967,
+ "step": 1100
+ },
+ {
+ "epoch": 3.52,
+ "eval_logits/chosen": 0.8145170211791992,
+ "eval_logits/rejected": 0.8724321126937866,
+ "eval_logps/chosen": -256.90802001953125,
+ "eval_logps/rejected": -233.87786865234375,
+ "eval_loss": 0.00016770198999438435,
+ "eval_rewards/accuracies": 0.5264999866485596,
+ "eval_rewards/chosen": -0.00293838232755661,
+ "eval_rewards/margins": 0.00045752059668302536,
+ "eval_rewards/rejected": -0.003395902691408992,
+ "eval_runtime": 412.1486,
+ "eval_samples_per_second": 4.853,
+ "eval_steps_per_second": 1.213,
+ "step": 1100
+ },
+ {
+ "epoch": 3.55,
+ "learning_rate": 1.8399576256041525e-07,
+ "logits/chosen": 0.6978103518486023,
+ "logits/rejected": 0.7642599940299988,
+ "logps/chosen": -188.5946502685547,
+ "logps/rejected": -142.72915649414062,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.581250011920929,
+ "rewards/chosen": 0.004109769128262997,
+ "rewards/margins": 0.003021980170160532,
+ "rewards/rejected": 0.0010877888416871428,
+ "step": 1110
+ },
+ {
+ "epoch": 3.58,
+ "learning_rate": 1.58571353474391e-07,
+ "logits/chosen": 0.6541138887405396,
+ "logits/rejected": 0.7682735323905945,
+ "logps/chosen": -157.97927856445312,
+ "logps/rejected": -120.40803527832031,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.5625,
+ "rewards/chosen": 0.0030703709926456213,
+ "rewards/margins": 0.0024266496766358614,
+ "rewards/rejected": 0.0006437213160097599,
+ "step": 1120
+ },
+ {
+ "epoch": 3.62,
+ "learning_rate": 1.3497922858944857e-07,
+ "logits/chosen": 0.7424976229667664,
+ "logits/rejected": 0.8078197240829468,
+ "logps/chosen": -174.2366943359375,
+ "logps/rejected": -131.4811553955078,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.6937500238418579,
+ "rewards/chosen": 0.004558051936328411,
+ "rewards/margins": 0.003959262277930975,
+ "rewards/rejected": 0.0005987894837744534,
+ "step": 1130
+ },
+ {
+ "epoch": 3.65,
+ "learning_rate": 1.1323784991499471e-07,
+ "logits/chosen": 0.6016920804977417,
+ "logits/rejected": 0.7492619752883911,
+ "logps/chosen": -172.1629638671875,
+ "logps/rejected": -116.34178161621094,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.6812499761581421,
+ "rewards/chosen": 0.004433467984199524,
+ "rewards/margins": 0.004034010227769613,
+ "rewards/rejected": 0.0003994574653916061,
+ "step": 1140
+ },
+ {
+ "epoch": 3.68,
+ "learning_rate": 9.336423115961002e-08,
+ "logits/chosen": 0.6536157727241516,
+ "logits/rejected": 0.8383282423019409,
+ "logps/chosen": -179.27108764648438,
+ "logps/rejected": -121.70698547363281,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.6875,
+ "rewards/chosen": 0.00442055519670248,
+ "rewards/margins": 0.003978613764047623,
+ "rewards/rejected": 0.00044194143265485764,
+ "step": 1150
+ },
+ {
+ "epoch": 3.71,
+ "learning_rate": 7.537392441697793e-08,
+ "logits/chosen": 0.6289481520652771,
+ "logits/rejected": 0.8159587979316711,
+ "logps/chosen": -178.74453735351562,
+ "logps/rejected": -126.57807922363281,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.6499999761581421,
+ "rewards/chosen": 0.004435301758348942,
+ "rewards/margins": 0.0039503672160208225,
+ "rewards/rejected": 0.0004849349206779152,
+ "step": 1160
+ },
+ {
+ "epoch": 3.74,
+ "learning_rate": 5.928100799559938e-08,
+ "logits/chosen": 0.6689172387123108,
+ "logits/rejected": 0.7863209843635559,
+ "logps/chosen": -178.4945068359375,
+ "logps/rejected": -124.7654037475586,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.643750011920929,
+ "rewards/chosen": 0.005153838545084,
+ "rewards/margins": 0.003751266747713089,
+ "rewards/rejected": 0.001402571564540267,
+ "step": 1170
+ },
+ {
+ "epoch": 3.78,
+ "learning_rate": 4.5098075401815435e-08,
+ "logits/chosen": 0.6192042827606201,
+ "logits/rejected": 0.76641446352005,
+ "logps/chosen": -171.137939453125,
+ "logps/rejected": -125.4515151977539,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.6625000238418579,
+ "rewards/chosen": 0.004533619619905949,
+ "rewards/margins": 0.003654058324173093,
+ "rewards/rejected": 0.000879561179317534,
+ "step": 1180
+ },
+ {
+ "epoch": 3.81,
+ "learning_rate": 3.283622548476445e-08,
+ "logits/chosen": 0.6808220148086548,
+ "logits/rejected": 0.8677403330802917,
+ "logps/chosen": -178.3741455078125,
+ "logps/rejected": -125.1767349243164,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.637499988079071,
+ "rewards/chosen": 0.005535735748708248,
+ "rewards/margins": 0.004433406982570887,
+ "rewards/rejected": 0.0011023286497220397,
+ "step": 1190
+ },
+ {
+ "epoch": 3.84,
+ "learning_rate": 2.250505375098161e-08,
+ "logits/chosen": 0.669333815574646,
+ "logits/rejected": 0.808275043964386,
+ "logps/chosen": -176.30514526367188,
+ "logps/rejected": -125.28890228271484,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.6625000238418579,
+ "rewards/chosen": 0.004538280423730612,
+ "rewards/margins": 0.0036300034262239933,
+ "rewards/rejected": 0.0009082768228836358,
+ "step": 1200
+ },
+ {
+ "epoch": 3.84,
+ "eval_logits/chosen": 0.8126665353775024,
+ "eval_logits/rejected": 0.8705229759216309,
+ "eval_logps/chosen": -256.92266845703125,
+ "eval_logps/rejected": -233.87326049804688,
+ "eval_loss": 0.00017112624482251704,
+ "eval_rewards/accuracies": 0.5044999718666077,
+ "eval_rewards/chosen": -0.0030848486348986626,
+ "eval_rewards/margins": 0.0002651172399055213,
+ "eval_rewards/rejected": -0.0033499656710773706,
+ "eval_runtime": 412.2112,
+ "eval_samples_per_second": 4.852,
+ "eval_steps_per_second": 1.213,
+ "step": 1200
+ },
+ {
+ "epoch": 3.87,
+ "learning_rate": 1.4112644855438228e-08,
+ "logits/chosen": 0.6185505390167236,
+ "logits/rejected": 0.786270260810852,
+ "logps/chosen": -172.150634765625,
+ "logps/rejected": -123.99739074707031,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.668749988079071,
+ "rewards/chosen": 0.005183863919228315,
+ "rewards/margins": 0.004181054420769215,
+ "rewards/rejected": 0.0010028090327978134,
+ "step": 1210
+ },
+ {
+ "epoch": 3.9,
+ "learning_rate": 7.665566274897007e-09,
+ "logits/chosen": 0.6319261789321899,
+ "logits/rejected": 0.8173401951789856,
+ "logps/chosen": -197.80917358398438,
+ "logps/rejected": -135.27621459960938,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.7250000238418579,
+ "rewards/chosen": 0.005665967706590891,
+ "rewards/margins": 0.005035373382270336,
+ "rewards/rejected": 0.0006305938586592674,
+ "step": 1220
+ },
+ {
+ "epoch": 3.94,
+ "learning_rate": 3.1688631685364292e-09,
+ "logits/chosen": 0.5452470183372498,
+ "logits/rejected": 0.7646620869636536,
+ "logps/chosen": -184.90304565429688,
+ "logps/rejected": -128.32806396484375,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.643750011920929,
+ "rewards/chosen": 0.0048486096784472466,
+ "rewards/margins": 0.004199582617729902,
+ "rewards/rejected": 0.0006490271771326661,
+ "step": 1230
+ },
+ {
+ "epoch": 3.97,
+ "learning_rate": 6.260544298619664e-10,
+ "logits/chosen": 0.6011223793029785,
+ "logits/rejected": 0.7510023713111877,
+ "logps/chosen": -178.15272521972656,
+ "logps/rejected": -124.17759704589844,
+ "loss": 0.0001,
+ "rewards/accuracies": 0.612500011920929,
+ "rewards/chosen": 0.0051615191623568535,
+ "rewards/margins": 0.003729583229869604,
+ "rewards/rejected": 0.0014319362817332149,
+ "step": 1240
+ },
+ {
+ "epoch": 3.99,
+ "step": 1248,
+ "total_flos": 0.0,
+ "train_loss": 9.35627439654733e-05,
+ "train_runtime": 14053.8323,
+ "train_samples_per_second": 1.423,
+ "train_steps_per_second": 0.089
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 1248,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 4,
+ "save_steps": 100,
+ "total_flos": 0.0,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
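
The trainer_state.json added above is the standard Hugging Face Trainer log: training records every 10 steps (per `"logging_steps": 10`) and evaluation records every 100 steps (the entries carrying `eval_*` keys). As a minimal sketch for inspecting the run, assuming the file is saved locally as `trainer_state.json` and uses the usual top-level `log_history` list:

```python
# Minimal sketch: print the eval-loss / reward-margin trajectory recorded in the
# trainer state. Assumes the standard Trainer layout (a top-level "log_history"
# list whose evaluation records carry "eval_loss" and "eval_rewards/margins").
import json

with open("trainer_state.json") as f:
    state = json.load(f)

for record in state["log_history"]:
    if "eval_loss" in record:  # evaluation checkpoints only
        print(
            f"step {record['step']:>4}  "
            f"eval_loss {record['eval_loss']:.6f}  "
            f"reward_margin {record['eval_rewards/margins']:.6f}"
        )
```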