BraylonDash committed on
Commit b611c76 · 1 Parent(s): 30a8a30

Model save

README.md ADDED
@@ -0,0 +1,81 @@
+ ---
+ license: mit
+ library_name: peft
+ tags:
+ - trl
+ - dpo
+ - generated_from_trainer
+ base_model: microsoft/phi-2
+ model-index:
+ - name: phi-2-gpo-renew2-b0.001-extra-i1
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # phi-2-gpo-renew2-b0.001-extra-i1
+
+ This model is a fine-tuned version of [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) on an unspecified dataset.
+ It achieves the following results on the evaluation set (the sketch after this list shows how the reward metrics are defined):
+ - Loss: 0.0421
+ - Rewards/chosen: -0.0100
+ - Rewards/rejected: -0.0414
+ - Rewards/accuracies: 0.6015
+ - Rewards/margins: 0.0314
+ - Logps/rejected: -408.6569
+ - Logps/chosen: -406.3272
+ - Logits/rejected: -1.0065
+ - Logits/chosen: -1.0521
+
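The Rewards/* and Logps/* values above follow the implicit-reward convention of TRL-style DPO training: a completion's reward is beta times the gap between the policy and reference log-probabilities, the margin is chosen minus rejected, and the accuracy is the fraction of preference pairs with a positive margin. The sketch below is illustrative only; beta = 0.001 is an assumption read off the model name (`b0.001`), not a value stated in this card.

```python
# Illustrative sketch of how TRL-style DPO reward metrics are defined.
# beta = 0.001 is an assumption inferred from the model name ("b0.001").
import torch

def dpo_reward_metrics(policy_chosen_logps, policy_rejected_logps,
                       ref_chosen_logps, ref_rejected_logps, beta=0.001):
    # Implicit reward: beta * (log-prob under the policy - log-prob under the reference).
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
    margins = chosen_rewards - rejected_rewards
    return {
        "rewards/chosen": chosen_rewards.mean().item(),
        "rewards/rejected": rejected_rewards.mean().item(),
        "rewards/margins": margins.mean().item(),
        # Fraction of pairs where the chosen completion gets the higher reward.
        "rewards/accuracies": (margins > 0).float().mean().item(),
    }
```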
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
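This repository stores a PEFT adapter rather than full model weights, so inference loads the microsoft/phi-2 base model first and applies the adapter on top. A minimal usage sketch, assuming the adapter is loaded from this repository (the repo id below is inferred from the committer and model name) with the library versions listed under Framework versions:

```python
# Minimal usage sketch. The repo id is inferred from the committer and model
# name; adjust it if the adapter lives elsewhere.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

ADAPTER_ID = "BraylonDash/phi-2-gpo-renew2-b0.001-extra-i1"  # assumed repo id

base = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2", torch_dtype=torch.bfloat16, trust_remote_code=True
)
model = PeftModel.from_pretrained(base, ADAPTER_ID)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

prompt = "Instruct: Explain what preference optimization does.\nOutput:"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

If the adapter is a LoRA adapter, PEFT 0.7.1 also allows merging it into the base weights with `model.merge_and_unload()` when standalone weights are preferred.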
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training (a sketch mapping them onto `TrainingArguments` follows the list):
+ - learning_rate: 5e-06
+ - train_batch_size: 4
+ - eval_batch_size: 4
+ - seed: 42
+ - distributed_type: multi-GPU
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 16
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 1
+
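A sketch of how these values map onto `transformers.TrainingArguments`. The actual training script (and any TRL/DPO-specific settings, such as the beta suggested by the model name) is not part of this card, so this only mirrors the list above plus the logging/eval/save intervals recorded in trainer_state.json:

```python
# Sketch only: mirrors the hyperparameter list above. The real training script
# and its TRL/DPO-specific options are not included in this repository card.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="phi-2-gpo-renew2-b0.001-extra-i1",
    learning_rate=5e-06,
    per_device_train_batch_size=4,   # train_batch_size
    per_device_eval_batch_size=4,    # eval_batch_size
    gradient_accumulation_steps=4,   # 4 x 4 = total_train_batch_size of 16
    seed=42,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=1,
    adam_beta1=0.9,                  # "Adam with betas=(0.9,0.999) and epsilon=1e-08"
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    logging_steps=10,                # from trainer_state.json
    evaluation_strategy="steps",     # evaluation every eval_steps (assumed from the log)
    eval_steps=100,                  # from trainer_state.json
    save_steps=100,                  # from trainer_state.json
)
```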
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
+ |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
+ | 0.0982 | 0.11 | 100 | 0.0526 | -0.0081 | -0.0101 | 0.5190 | 0.0020 | -377.3773 | -404.4459 | -0.7816 | -0.8697 |
+ | 0.0846 | 0.21 | 200 | 0.0485 | -0.0265 | -0.0402 | 0.5530 | 0.0137 | -407.4654 | -422.8199 | -0.9792 | -1.0427 |
+ | 0.0859 | 0.32 | 300 | 0.0464 | -0.0257 | -0.0460 | 0.5725 | 0.0203 | -413.2813 | -422.0490 | -1.0612 | -1.1154 |
+ | 0.0957 | 0.43 | 400 | 0.0443 | -0.0207 | -0.0481 | 0.5780 | 0.0274 | -415.3487 | -417.0023 | -1.0450 | -1.0984 |
+ | 0.068 | 0.53 | 500 | 0.0432 | -0.0067 | -0.0318 | 0.5955 | 0.0252 | -399.0811 | -402.9732 | -0.9791 | -1.0329 |
+ | 0.0847 | 0.64 | 600 | 0.0427 | -0.0050 | -0.0312 | 0.5945 | 0.0263 | -398.4744 | -401.2879 | -0.9837 | -1.0364 |
+ | 0.0519 | 0.75 | 700 | 0.0423 | -0.0082 | -0.0377 | 0.5905 | 0.0295 | -404.9791 | -404.5331 | -0.9872 | -1.0360 |
+ | 0.0742 | 0.85 | 800 | 0.0422 | -0.0105 | -0.0420 | 0.6000 | 0.0315 | -409.2462 | -406.8035 | -1.0109 | -1.0556 |
+ | 0.0768 | 0.96 | 900 | 0.0421 | -0.0100 | -0.0415 | 0.5930 | 0.0315 | -408.7397 | -406.3475 | -1.0050 | -1.0502 |
+
+
+ ### Framework versions
+
+ - PEFT 0.7.1
+ - Transformers 4.36.2
+ - Pytorch 2.1.2
+ - Datasets 2.14.6
+ - Tokenizers 0.15.2
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dd1de0433ea447017eaf3c7483d9bc52d9194afb9001dc0dd6cb5f840468ab7c
+ oid sha256:bea97341984a3929c333752315ab1030a4371485fe4b2ca7a0a42d1472d664b2
  size 167807296
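The adapter weights themselves live in Git LFS; this commit only updates the pointer (object id and size). A small sketch for checking that a downloaded adapter_model.safetensors matches the new pointer:

```python
# Sketch: verify a downloaded adapter_model.safetensors against the LFS pointer above.
import hashlib

EXPECTED_OID = "bea97341984a3929c333752315ab1030a4371485fe4b2ca7a0a42d1472d664b2"
EXPECTED_SIZE = 167807296

with open("adapter_model.safetensors", "rb") as f:
    data = f.read()

assert len(data) == EXPECTED_SIZE, f"unexpected size: {len(data)} bytes"
assert hashlib.sha256(data).hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("adapter_model.safetensors matches the LFS pointer")
```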
all_results.json ADDED
@@ -0,0 +1,21 @@
+ {
+ "epoch": 1.0,
+ "eval_logits/chosen": -1.0520579814910889,
+ "eval_logits/rejected": -1.0065128803253174,
+ "eval_logps/chosen": -406.3271789550781,
+ "eval_logps/rejected": -408.65692138671875,
+ "eval_loss": 0.042073994874954224,
+ "eval_rewards/accuracies": 0.6014999747276306,
+ "eval_rewards/chosen": -0.010021946392953396,
+ "eval_rewards/margins": 0.031399574130773544,
+ "eval_rewards/rejected": -0.041421521455049515,
+ "eval_runtime": 546.029,
+ "eval_samples": 2000,
+ "eval_samples_per_second": 3.663,
+ "eval_steps_per_second": 0.916,
+ "train_loss": 0.07849642387894454,
+ "train_runtime": 13138.6455,
+ "train_samples": 61135,
+ "train_samples_per_second": 1.142,
+ "train_steps_per_second": 0.071
+ }
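A quick sanity check on the aggregate metrics: the reported evaluation throughput is simply eval_samples divided by eval_runtime.

```python
# Sketch: recompute the reported eval throughput from all_results.json.
import json

with open("all_results.json") as f:
    results = json.load(f)

throughput = results["eval_samples"] / results["eval_runtime"]  # 2000 / 546.029
print(f"{throughput:.3f} samples/s")  # ~3.663, matching eval_samples_per_second
```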
eval_results.json ADDED
@@ -0,0 +1,16 @@
+ {
+ "epoch": 1.0,
+ "eval_logits/chosen": -1.0520579814910889,
+ "eval_logits/rejected": -1.0065128803253174,
+ "eval_logps/chosen": -406.3271789550781,
+ "eval_logps/rejected": -408.65692138671875,
+ "eval_loss": 0.042073994874954224,
+ "eval_rewards/accuracies": 0.6014999747276306,
+ "eval_rewards/chosen": -0.010021946392953396,
+ "eval_rewards/margins": 0.031399574130773544,
+ "eval_rewards/rejected": -0.041421521455049515,
+ "eval_runtime": 546.029,
+ "eval_samples": 2000,
+ "eval_samples_per_second": 3.663,
+ "eval_steps_per_second": 0.916
+ }
runs/Apr23_11-08-08_gpu4-119-5/events.out.tfevents.1713834594.gpu4-119-5.3428264.0 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:86f210e86213e5891437026003601dbf64fd8766b6c797dc6a6c253c2c8b68fd
- size 69060
+ oid sha256:e719adeb13bb3954f77ccc64602132f29702cd2852fc88295977a44c7d81ef3c
+ size 71316
runs/Apr23_11-08-08_gpu4-119-5/events.out.tfevents.1713848278.gpu4-119-5.3428264.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bdb089c71d68fb97ba399c1461d4325cf2b5c931770f84dbe8b6a6445a9673da
+ size 828
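The runs/ directory holds the TensorBoard event files for this training run (also stored in Git LFS). A sketch of reading the logged scalars programmatically once the files are downloaded; the tag name used below is an assumption, so check `acc.Tags()` for the actual names:

```python
# Sketch: read scalars from the downloaded TensorBoard event files.
# The "train/loss" tag is an assumption; inspect acc.Tags() for the real tag names.
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator("runs/Apr23_11-08-08_gpu4-119-5")
acc.Reload()
print(acc.Tags()["scalars"])          # list of available scalar tags
for event in acc.Scalars("train/loss"):
    print(event.step, event.value)
```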
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 1.0,
+ "train_loss": 0.07849642387894454,
+ "train_runtime": 13138.6455,
+ "train_samples": 61135,
+ "train_samples_per_second": 1.142,
+ "train_steps_per_second": 0.071
+ }
trainer_state.json ADDED
@@ -0,0 +1,1490 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9994666666666666,
5
+ "eval_steps": 100,
6
+ "global_step": 937,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 5.319148936170213e-08,
14
+ "logits/chosen": -0.5045956373214722,
15
+ "logits/rejected": -0.805889368057251,
16
+ "logps/chosen": -165.41160583496094,
17
+ "logps/rejected": -172.8127899169922,
18
+ "loss": 0.0848,
19
+ "rewards/accuracies": 0.0,
20
+ "rewards/chosen": 0.0,
21
+ "rewards/margins": 0.0,
22
+ "rewards/rejected": 0.0,
23
+ "step": 1
24
+ },
25
+ {
26
+ "epoch": 0.01,
27
+ "learning_rate": 5.319148936170213e-07,
28
+ "logits/chosen": -0.851473867893219,
29
+ "logits/rejected": -0.8214991092681885,
30
+ "logps/chosen": -258.1239013671875,
31
+ "logps/rejected": -255.48716735839844,
32
+ "loss": 0.0877,
33
+ "rewards/accuracies": 0.2986111044883728,
34
+ "rewards/chosen": 0.0002587677154224366,
35
+ "rewards/margins": 0.00023072944895830005,
36
+ "rewards/rejected": 2.803823554131668e-05,
37
+ "step": 10
38
+ },
39
+ {
40
+ "epoch": 0.02,
41
+ "learning_rate": 1.0638297872340427e-06,
42
+ "logits/chosen": -0.8987849354743958,
43
+ "logits/rejected": -0.7349363565444946,
44
+ "logps/chosen": -260.9398193359375,
45
+ "logps/rejected": -253.32925415039062,
46
+ "loss": 0.0893,
47
+ "rewards/accuracies": 0.29374998807907104,
48
+ "rewards/chosen": -0.000219681765884161,
49
+ "rewards/margins": -3.2768032269814285e-06,
50
+ "rewards/rejected": -0.00021640490740537643,
51
+ "step": 20
52
+ },
53
+ {
54
+ "epoch": 0.03,
55
+ "learning_rate": 1.595744680851064e-06,
56
+ "logits/chosen": -0.9162956476211548,
57
+ "logits/rejected": -0.7800331115722656,
58
+ "logps/chosen": -240.79800415039062,
59
+ "logps/rejected": -235.59182739257812,
60
+ "loss": 0.0783,
61
+ "rewards/accuracies": 0.2874999940395355,
62
+ "rewards/chosen": -0.00024258208577521145,
63
+ "rewards/margins": 0.00012204260565340519,
64
+ "rewards/rejected": -0.0003646246623247862,
65
+ "step": 30
66
+ },
67
+ {
68
+ "epoch": 0.04,
69
+ "learning_rate": 2.1276595744680853e-06,
70
+ "logits/chosen": -0.8354488611221313,
71
+ "logits/rejected": -0.8405616879463196,
72
+ "logps/chosen": -255.01931762695312,
73
+ "logps/rejected": -224.09188842773438,
74
+ "loss": 0.0749,
75
+ "rewards/accuracies": 0.30000001192092896,
76
+ "rewards/chosen": 0.0001305304904235527,
77
+ "rewards/margins": 0.0002960737328976393,
78
+ "rewards/rejected": -0.0001655431988183409,
79
+ "step": 40
80
+ },
81
+ {
82
+ "epoch": 0.05,
83
+ "learning_rate": 2.6595744680851065e-06,
84
+ "logits/chosen": -0.9115155935287476,
85
+ "logits/rejected": -0.7566107511520386,
86
+ "logps/chosen": -295.87884521484375,
87
+ "logps/rejected": -261.06951904296875,
88
+ "loss": 0.0713,
89
+ "rewards/accuracies": 0.2874999940395355,
90
+ "rewards/chosen": -0.0011757513275370002,
91
+ "rewards/margins": -0.0006419935962185264,
92
+ "rewards/rejected": -0.0005337577313184738,
93
+ "step": 50
94
+ },
95
+ {
96
+ "epoch": 0.06,
97
+ "learning_rate": 3.191489361702128e-06,
98
+ "logits/chosen": -0.9281567335128784,
99
+ "logits/rejected": -0.8129026293754578,
100
+ "logps/chosen": -261.63751220703125,
101
+ "logps/rejected": -261.89483642578125,
102
+ "loss": 0.0779,
103
+ "rewards/accuracies": 0.32499998807907104,
104
+ "rewards/chosen": -0.000994718400761485,
105
+ "rewards/margins": 0.00043715062201954424,
106
+ "rewards/rejected": -0.0014318691100925207,
107
+ "step": 60
108
+ },
109
+ {
110
+ "epoch": 0.07,
111
+ "learning_rate": 3.723404255319149e-06,
112
+ "logits/chosen": -0.8597652316093445,
113
+ "logits/rejected": -0.8151811361312866,
114
+ "logps/chosen": -271.51458740234375,
115
+ "logps/rejected": -241.4061279296875,
116
+ "loss": 0.0898,
117
+ "rewards/accuracies": 0.3062500059604645,
118
+ "rewards/chosen": -0.0014379310887306929,
119
+ "rewards/margins": 0.0002869053860194981,
120
+ "rewards/rejected": -0.0017248367657884955,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.09,
125
+ "learning_rate": 4.255319148936171e-06,
126
+ "logits/chosen": -0.8520501255989075,
127
+ "logits/rejected": -0.811953067779541,
128
+ "logps/chosen": -311.61431884765625,
129
+ "logps/rejected": -305.77520751953125,
130
+ "loss": 0.0795,
131
+ "rewards/accuracies": 0.33125001192092896,
132
+ "rewards/chosen": -0.0034166008699685335,
133
+ "rewards/margins": 0.00010135892080143094,
134
+ "rewards/rejected": -0.003517959965392947,
135
+ "step": 80
136
+ },
137
+ {
138
+ "epoch": 0.1,
139
+ "learning_rate": 4.787234042553192e-06,
140
+ "logits/chosen": -0.9568193554878235,
141
+ "logits/rejected": -0.8735030293464661,
142
+ "logps/chosen": -277.09405517578125,
143
+ "logps/rejected": -237.3052978515625,
144
+ "loss": 0.0831,
145
+ "rewards/accuracies": 0.3687500059604645,
146
+ "rewards/chosen": -0.0037826255429536104,
147
+ "rewards/margins": 0.0007604987476952374,
148
+ "rewards/rejected": -0.0045431237667799,
149
+ "step": 90
150
+ },
151
+ {
152
+ "epoch": 0.11,
153
+ "learning_rate": 4.999375059004058e-06,
154
+ "logits/chosen": -0.8838983774185181,
155
+ "logits/rejected": -0.8245723843574524,
156
+ "logps/chosen": -274.2312927246094,
157
+ "logps/rejected": -233.004638671875,
158
+ "loss": 0.0982,
159
+ "rewards/accuracies": 0.375,
160
+ "rewards/chosen": -0.004330983851104975,
161
+ "rewards/margins": 0.0017646064516156912,
162
+ "rewards/rejected": -0.00609559053555131,
163
+ "step": 100
164
+ },
165
+ {
166
+ "epoch": 0.11,
167
+ "eval_logits/chosen": -0.8696709871292114,
168
+ "eval_logits/rejected": -0.7816442847251892,
169
+ "eval_logps/chosen": -404.4459228515625,
170
+ "eval_logps/rejected": -377.37725830078125,
171
+ "eval_loss": 0.05261076241731644,
172
+ "eval_rewards/accuracies": 0.5189999938011169,
173
+ "eval_rewards/chosen": -0.008140643127262592,
174
+ "eval_rewards/margins": 0.0020011626183986664,
175
+ "eval_rewards/rejected": -0.010141806676983833,
176
+ "eval_runtime": 545.9504,
177
+ "eval_samples_per_second": 3.663,
178
+ "eval_steps_per_second": 0.916,
179
+ "step": 100
180
+ },
181
+ {
182
+ "epoch": 0.12,
183
+ "learning_rate": 4.9955571065548795e-06,
184
+ "logits/chosen": -0.9740250706672668,
185
+ "logits/rejected": -0.8206865191459656,
186
+ "logps/chosen": -308.79986572265625,
187
+ "logps/rejected": -279.56817626953125,
188
+ "loss": 0.0729,
189
+ "rewards/accuracies": 0.3499999940395355,
190
+ "rewards/chosen": -0.006666774861514568,
191
+ "rewards/margins": 0.001093443250283599,
192
+ "rewards/rejected": -0.0077602192759513855,
193
+ "step": 110
194
+ },
195
+ {
196
+ "epoch": 0.13,
197
+ "learning_rate": 4.9882736864879e-06,
198
+ "logits/chosen": -0.9458662271499634,
199
+ "logits/rejected": -0.8622045516967773,
200
+ "logps/chosen": -269.64190673828125,
201
+ "logps/rejected": -255.1685028076172,
202
+ "loss": 0.0678,
203
+ "rewards/accuracies": 0.36250001192092896,
204
+ "rewards/chosen": -0.006551130209118128,
205
+ "rewards/margins": 0.0029696193523705006,
206
+ "rewards/rejected": -0.009520749561488628,
207
+ "step": 120
208
+ },
209
+ {
210
+ "epoch": 0.14,
211
+ "learning_rate": 4.977534912960124e-06,
212
+ "logits/chosen": -1.0262787342071533,
213
+ "logits/rejected": -0.9416742324829102,
214
+ "logps/chosen": -228.7926788330078,
215
+ "logps/rejected": -229.67898559570312,
216
+ "loss": 0.0929,
217
+ "rewards/accuracies": 0.32499998807907104,
218
+ "rewards/chosen": -0.008367964997887611,
219
+ "rewards/margins": 0.0024298636708408594,
220
+ "rewards/rejected": -0.010797828435897827,
221
+ "step": 130
222
+ },
223
+ {
224
+ "epoch": 0.15,
225
+ "learning_rate": 4.963355698422092e-06,
226
+ "logits/chosen": -0.9834293127059937,
227
+ "logits/rejected": -0.9608744382858276,
228
+ "logps/chosen": -244.4986572265625,
229
+ "logps/rejected": -238.37118530273438,
230
+ "loss": 0.092,
231
+ "rewards/accuracies": 0.30000001192092896,
232
+ "rewards/chosen": -0.010878035798668861,
233
+ "rewards/margins": 0.003188747214153409,
234
+ "rewards/rejected": -0.014066783711314201,
235
+ "step": 140
236
+ },
237
+ {
238
+ "epoch": 0.16,
239
+ "learning_rate": 4.945755732909625e-06,
240
+ "logits/chosen": -1.0271694660186768,
241
+ "logits/rejected": -0.8658772706985474,
242
+ "logps/chosen": -303.09539794921875,
243
+ "logps/rejected": -265.5880126953125,
244
+ "loss": 0.0767,
245
+ "rewards/accuracies": 0.42500001192092896,
246
+ "rewards/chosen": -0.013803419657051563,
247
+ "rewards/margins": 0.005266121588647366,
248
+ "rewards/rejected": -0.01906954124569893,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.17,
253
+ "learning_rate": 4.924759456701167e-06,
254
+ "logits/chosen": -1.1086270809173584,
255
+ "logits/rejected": -1.041982650756836,
256
+ "logps/chosen": -317.58245849609375,
257
+ "logps/rejected": -280.8768310546875,
258
+ "loss": 0.0902,
259
+ "rewards/accuracies": 0.39375001192092896,
260
+ "rewards/chosen": -0.021537428721785545,
261
+ "rewards/margins": 0.0052458057180047035,
262
+ "rewards/rejected": -0.026783233508467674,
263
+ "step": 160
264
+ },
265
+ {
266
+ "epoch": 0.18,
267
+ "learning_rate": 4.900396026378671e-06,
268
+ "logits/chosen": -1.1098581552505493,
269
+ "logits/rejected": -0.9803470373153687,
270
+ "logps/chosen": -335.6375427246094,
271
+ "logps/rejected": -316.95733642578125,
272
+ "loss": 0.0536,
273
+ "rewards/accuracies": 0.3812499940395355,
274
+ "rewards/chosen": -0.026775449514389038,
275
+ "rewards/margins": 0.008162637241184711,
276
+ "rewards/rejected": -0.03493808954954147,
277
+ "step": 170
278
+ },
279
+ {
280
+ "epoch": 0.19,
281
+ "learning_rate": 4.872699274339169e-06,
282
+ "logits/chosen": -1.0897270441055298,
283
+ "logits/rejected": -1.001300573348999,
284
+ "logps/chosen": -296.06353759765625,
285
+ "logps/rejected": -265.63751220703125,
286
+ "loss": 0.0746,
287
+ "rewards/accuracies": 0.3687500059604645,
288
+ "rewards/chosen": -0.022861812263727188,
289
+ "rewards/margins": 0.005839685909450054,
290
+ "rewards/rejected": -0.02870149537920952,
291
+ "step": 180
292
+ },
293
+ {
294
+ "epoch": 0.2,
295
+ "learning_rate": 4.8417076618132434e-06,
296
+ "logits/chosen": -1.0900896787643433,
297
+ "logits/rejected": -1.0583564043045044,
298
+ "logps/chosen": -266.21160888671875,
299
+ "logps/rejected": -245.45376586914062,
300
+ "loss": 0.0942,
301
+ "rewards/accuracies": 0.2874999940395355,
302
+ "rewards/chosen": -0.018833670765161514,
303
+ "rewards/margins": 0.00439481670036912,
304
+ "rewards/rejected": -0.023228485137224197,
305
+ "step": 190
306
+ },
307
+ {
308
+ "epoch": 0.21,
309
+ "learning_rate": 4.807464225455655e-06,
310
+ "logits/chosen": -1.0679035186767578,
311
+ "logits/rejected": -1.003225564956665,
312
+ "logps/chosen": -345.30181884765625,
313
+ "logps/rejected": -323.2543640136719,
314
+ "loss": 0.0846,
315
+ "rewards/accuracies": 0.39375001192092896,
316
+ "rewards/chosen": -0.02550928294658661,
317
+ "rewards/margins": 0.003808406414464116,
318
+ "rewards/rejected": -0.029317688196897507,
319
+ "step": 200
320
+ },
321
+ {
322
+ "epoch": 0.21,
323
+ "eval_logits/chosen": -1.0426863431930542,
324
+ "eval_logits/rejected": -0.9791997671127319,
325
+ "eval_logps/chosen": -422.8199462890625,
326
+ "eval_logps/rejected": -407.46539306640625,
327
+ "eval_loss": 0.048517368733882904,
328
+ "eval_rewards/accuracies": 0.5529999732971191,
329
+ "eval_rewards/chosen": -0.026514720171689987,
330
+ "eval_rewards/margins": 0.013715260662138462,
331
+ "eval_rewards/rejected": -0.040229979902505875,
332
+ "eval_runtime": 545.8919,
333
+ "eval_samples_per_second": 3.664,
334
+ "eval_steps_per_second": 0.916,
335
+ "step": 200
336
+ },
337
+ {
338
+ "epoch": 0.22,
339
+ "learning_rate": 4.770016517582283e-06,
340
+ "logits/chosen": -1.0943100452423096,
341
+ "logits/rejected": -1.0767104625701904,
342
+ "logps/chosen": -325.3826904296875,
343
+ "logps/rejected": -329.48663330078125,
344
+ "loss": 0.0676,
345
+ "rewards/accuracies": 0.45625001192092896,
346
+ "rewards/chosen": -0.018910493701696396,
347
+ "rewards/margins": 0.013479876331984997,
348
+ "rewards/rejected": -0.032390374690294266,
349
+ "step": 210
350
+ },
351
+ {
352
+ "epoch": 0.23,
353
+ "learning_rate": 4.7294165401363616e-06,
354
+ "logits/chosen": -1.0735843181610107,
355
+ "logits/rejected": -1.0250236988067627,
356
+ "logps/chosen": -313.1111145019531,
357
+ "logps/rejected": -292.7440490722656,
358
+ "loss": 0.0745,
359
+ "rewards/accuracies": 0.3812499940395355,
360
+ "rewards/chosen": -0.018653307110071182,
361
+ "rewards/margins": 0.01024434994906187,
362
+ "rewards/rejected": -0.028897657990455627,
363
+ "step": 220
364
+ },
365
+ {
366
+ "epoch": 0.25,
367
+ "learning_rate": 4.68572067247573e-06,
368
+ "logits/chosen": -1.1854560375213623,
369
+ "logits/rejected": -1.0616133213043213,
370
+ "logps/chosen": -281.5065612792969,
371
+ "logps/rejected": -277.0508728027344,
372
+ "loss": 0.0699,
373
+ "rewards/accuracies": 0.4000000059604645,
374
+ "rewards/chosen": -0.02585085853934288,
375
+ "rewards/margins": 0.01193526666611433,
376
+ "rewards/rejected": -0.037786126136779785,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 0.26,
381
+ "learning_rate": 4.638989593081364e-06,
382
+ "logits/chosen": -1.1488720178604126,
383
+ "logits/rejected": -1.0367449522018433,
384
+ "logps/chosen": -283.6094970703125,
385
+ "logps/rejected": -252.0610809326172,
386
+ "loss": 0.0894,
387
+ "rewards/accuracies": 0.28125,
388
+ "rewards/chosen": -0.020607244223356247,
389
+ "rewards/margins": 0.006351941730827093,
390
+ "rewards/rejected": -0.026959186419844627,
391
+ "step": 240
392
+ },
393
+ {
394
+ "epoch": 0.27,
395
+ "learning_rate": 4.5892881952959015e-06,
396
+ "logits/chosen": -1.1442514657974243,
397
+ "logits/rejected": -1.1018245220184326,
398
+ "logps/chosen": -303.63458251953125,
399
+ "logps/rejected": -305.7111511230469,
400
+ "loss": 0.0878,
401
+ "rewards/accuracies": 0.4000000059604645,
402
+ "rewards/chosen": -0.02438071370124817,
403
+ "rewards/margins": 0.009189085103571415,
404
+ "rewards/rejected": -0.03356979787349701,
405
+ "step": 250
406
+ },
407
+ {
408
+ "epoch": 0.28,
409
+ "learning_rate": 4.536685497209182e-06,
410
+ "logits/chosen": -1.134932041168213,
411
+ "logits/rejected": -1.106890082359314,
412
+ "logps/chosen": -300.16815185546875,
413
+ "logps/rejected": -284.40765380859375,
414
+ "loss": 0.0921,
415
+ "rewards/accuracies": 0.3812499940395355,
416
+ "rewards/chosen": -0.025658372789621353,
417
+ "rewards/margins": 0.005949888378381729,
418
+ "rewards/rejected": -0.03160826116800308,
419
+ "step": 260
420
+ },
421
+ {
422
+ "epoch": 0.29,
423
+ "learning_rate": 4.481254545815943e-06,
424
+ "logits/chosen": -1.1486608982086182,
425
+ "logits/rejected": -1.0326900482177734,
426
+ "logps/chosen": -294.9302673339844,
427
+ "logps/rejected": -285.3538513183594,
428
+ "loss": 0.0814,
429
+ "rewards/accuracies": 0.36250001192092896,
430
+ "rewards/chosen": -0.022086424753069878,
431
+ "rewards/margins": 0.010328343138098717,
432
+ "rewards/rejected": -0.03241477161645889,
433
+ "step": 270
434
+ },
435
+ {
436
+ "epoch": 0.3,
437
+ "learning_rate": 4.42307231557875e-06,
438
+ "logits/chosen": -1.121669888496399,
439
+ "logits/rejected": -1.0683071613311768,
440
+ "logps/chosen": -310.044189453125,
441
+ "logps/rejected": -306.346435546875,
442
+ "loss": 0.0916,
443
+ "rewards/accuracies": 0.4000000059604645,
444
+ "rewards/chosen": -0.02136383019387722,
445
+ "rewards/margins": 0.012828357517719269,
446
+ "rewards/rejected": -0.03419218957424164,
447
+ "step": 280
448
+ },
449
+ {
450
+ "epoch": 0.31,
451
+ "learning_rate": 4.3622196015370305e-06,
452
+ "logits/chosen": -1.1434075832366943,
453
+ "logits/rejected": -1.1294233798980713,
454
+ "logps/chosen": -259.6036682128906,
455
+ "logps/rejected": -267.0892333984375,
456
+ "loss": 0.0856,
457
+ "rewards/accuracies": 0.3375000059604645,
458
+ "rewards/chosen": -0.01935209520161152,
459
+ "rewards/margins": 0.008529379032552242,
460
+ "rewards/rejected": -0.027881473302841187,
461
+ "step": 290
462
+ },
463
+ {
464
+ "epoch": 0.32,
465
+ "learning_rate": 4.298780907110648e-06,
466
+ "logits/chosen": -1.214237928390503,
467
+ "logits/rejected": -1.1189312934875488,
468
+ "logps/chosen": -250.4637451171875,
469
+ "logps/rejected": -255.03079223632812,
470
+ "loss": 0.0859,
471
+ "rewards/accuracies": 0.35624998807907104,
472
+ "rewards/chosen": -0.014152693562209606,
473
+ "rewards/margins": 0.011050628498196602,
474
+ "rewards/rejected": -0.025203322991728783,
475
+ "step": 300
476
+ },
477
+ {
478
+ "epoch": 0.32,
479
+ "eval_logits/chosen": -1.11543869972229,
480
+ "eval_logits/rejected": -1.0612365007400513,
481
+ "eval_logps/chosen": -422.0489807128906,
482
+ "eval_logps/rejected": -413.28131103515625,
483
+ "eval_loss": 0.046408262103796005,
484
+ "eval_rewards/accuracies": 0.5724999904632568,
485
+ "eval_rewards/chosen": -0.02574371173977852,
486
+ "eval_rewards/margins": 0.02030220627784729,
487
+ "eval_rewards/rejected": -0.04604591801762581,
488
+ "eval_runtime": 546.0923,
489
+ "eval_samples_per_second": 3.662,
490
+ "eval_steps_per_second": 0.916,
491
+ "step": 300
492
+ },
493
+ {
494
+ "epoch": 0.33,
495
+ "learning_rate": 4.23284432675381e-06,
496
+ "logits/chosen": -1.1313543319702148,
497
+ "logits/rejected": -1.0724719762802124,
498
+ "logps/chosen": -262.6627502441406,
499
+ "logps/rejected": -277.54632568359375,
500
+ "loss": 0.0916,
501
+ "rewards/accuracies": 0.38749998807907104,
502
+ "rewards/chosen": -0.019768275320529938,
503
+ "rewards/margins": 0.014759126119315624,
504
+ "rewards/rejected": -0.03452740237116814,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 0.34,
509
+ "learning_rate": 4.164501423622277e-06,
510
+ "logits/chosen": -1.246671199798584,
511
+ "logits/rejected": -1.1658015251159668,
512
+ "logps/chosen": -264.7757873535156,
513
+ "logps/rejected": -277.6180114746094,
514
+ "loss": 0.0774,
515
+ "rewards/accuracies": 0.33125001192092896,
516
+ "rewards/chosen": -0.020890336483716965,
517
+ "rewards/margins": 0.011497320607304573,
518
+ "rewards/rejected": -0.03238765895366669,
519
+ "step": 320
520
+ },
521
+ {
522
+ "epoch": 0.35,
523
+ "learning_rate": 4.0938471024237355e-06,
524
+ "logits/chosen": -1.222037672996521,
525
+ "logits/rejected": -1.156553030014038,
526
+ "logps/chosen": -256.74359130859375,
527
+ "logps/rejected": -277.230224609375,
528
+ "loss": 0.0805,
529
+ "rewards/accuracies": 0.4124999940395355,
530
+ "rewards/chosen": -0.015277216210961342,
531
+ "rewards/margins": 0.02550993300974369,
532
+ "rewards/rejected": -0.04078715294599533,
533
+ "step": 330
534
+ },
535
+ {
536
+ "epoch": 0.36,
537
+ "learning_rate": 4.020979477627907e-06,
538
+ "logits/chosen": -1.1742956638336182,
539
+ "logits/rejected": -1.1521165370941162,
540
+ "logps/chosen": -239.5863494873047,
541
+ "logps/rejected": -247.32522583007812,
542
+ "loss": 0.1015,
543
+ "rewards/accuracies": 0.3687500059604645,
544
+ "rewards/chosen": -0.01359327882528305,
545
+ "rewards/margins": 0.015490619465708733,
546
+ "rewards/rejected": -0.029083898290991783,
547
+ "step": 340
548
+ },
549
+ {
550
+ "epoch": 0.37,
551
+ "learning_rate": 3.9459997372194105e-06,
552
+ "logits/chosen": -1.1355948448181152,
553
+ "logits/rejected": -1.0996363162994385,
554
+ "logps/chosen": -268.62945556640625,
555
+ "logps/rejected": -278.85870361328125,
556
+ "loss": 0.0786,
557
+ "rewards/accuracies": 0.35624998807907104,
558
+ "rewards/chosen": -0.013594739139080048,
559
+ "rewards/margins": 0.00844600610435009,
560
+ "rewards/rejected": -0.02204074338078499,
561
+ "step": 350
562
+ },
563
+ {
564
+ "epoch": 0.38,
565
+ "learning_rate": 3.869012002182573e-06,
566
+ "logits/chosen": -1.2170326709747314,
567
+ "logits/rejected": -1.1180956363677979,
568
+ "logps/chosen": -265.72393798828125,
569
+ "logps/rejected": -233.3331298828125,
570
+ "loss": 0.0879,
571
+ "rewards/accuracies": 0.3812499940395355,
572
+ "rewards/chosen": -0.01117833610624075,
573
+ "rewards/margins": 0.015456246212124825,
574
+ "rewards/rejected": -0.02663458324968815,
575
+ "step": 360
576
+ },
577
+ {
578
+ "epoch": 0.39,
579
+ "learning_rate": 3.7901231819133104e-06,
580
+ "logits/chosen": -1.1366071701049805,
581
+ "logits/rejected": -1.0916543006896973,
582
+ "logps/chosen": -266.32037353515625,
583
+ "logps/rejected": -267.02313232421875,
584
+ "loss": 0.0726,
585
+ "rewards/accuracies": 0.4124999940395355,
586
+ "rewards/chosen": -0.01092919148504734,
587
+ "rewards/margins": 0.01567809283733368,
588
+ "rewards/rejected": -0.02660728432238102,
589
+ "step": 370
590
+ },
591
+ {
592
+ "epoch": 0.41,
593
+ "learning_rate": 3.709442825758875e-06,
594
+ "logits/chosen": -1.089163064956665,
595
+ "logits/rejected": -1.0779684782028198,
596
+ "logps/chosen": -259.6034240722656,
597
+ "logps/rejected": -261.5791015625,
598
+ "loss": 0.0859,
599
+ "rewards/accuracies": 0.41874998807907104,
600
+ "rewards/chosen": -0.008027950301766396,
601
+ "rewards/margins": 0.02001611702144146,
602
+ "rewards/rejected": -0.028044065460562706,
603
+ "step": 380
604
+ },
605
+ {
606
+ "epoch": 0.42,
607
+ "learning_rate": 3.6270829708916113e-06,
608
+ "logits/chosen": -1.0756354331970215,
609
+ "logits/rejected": -1.0809965133666992,
610
+ "logps/chosen": -317.52227783203125,
611
+ "logps/rejected": -299.58990478515625,
612
+ "loss": 0.066,
613
+ "rewards/accuracies": 0.4124999940395355,
614
+ "rewards/chosen": -0.012966620735824108,
615
+ "rewards/margins": 0.015139798633754253,
616
+ "rewards/rejected": -0.02810642123222351,
617
+ "step": 390
618
+ },
619
+ {
620
+ "epoch": 0.43,
621
+ "learning_rate": 3.543157986727991e-06,
622
+ "logits/chosen": -1.1576625108718872,
623
+ "logits/rejected": -1.0739920139312744,
624
+ "logps/chosen": -259.60516357421875,
625
+ "logps/rejected": -274.40240478515625,
626
+ "loss": 0.0957,
627
+ "rewards/accuracies": 0.48124998807907104,
628
+ "rewards/chosen": -0.006962036248296499,
629
+ "rewards/margins": 0.03462858498096466,
630
+ "rewards/rejected": -0.041590623557567596,
631
+ "step": 400
632
+ },
633
+ {
634
+ "epoch": 0.43,
635
+ "eval_logits/chosen": -1.0983970165252686,
636
+ "eval_logits/rejected": -1.0450440645217896,
637
+ "eval_logps/chosen": -417.00225830078125,
638
+ "eval_logps/rejected": -415.34869384765625,
639
+ "eval_loss": 0.04426228255033493,
640
+ "eval_rewards/accuracies": 0.578000009059906,
641
+ "eval_rewards/chosen": -0.02069696970283985,
642
+ "eval_rewards/margins": 0.027416307479143143,
643
+ "eval_rewards/rejected": -0.04811327904462814,
644
+ "eval_runtime": 545.9497,
645
+ "eval_samples_per_second": 3.663,
646
+ "eval_steps_per_second": 0.916,
647
+ "step": 400
648
+ },
649
+ {
650
+ "epoch": 0.44,
651
+ "learning_rate": 3.4577844161089614e-06,
652
+ "logits/chosen": -1.0938787460327148,
653
+ "logits/rejected": -1.0761361122131348,
654
+ "logps/chosen": -252.89974975585938,
655
+ "logps/rejected": -279.4630126953125,
656
+ "loss": 0.0703,
657
+ "rewards/accuracies": 0.3687500059604645,
658
+ "rewards/chosen": -0.017448369413614273,
659
+ "rewards/margins": 0.02003558911383152,
660
+ "rewards/rejected": -0.03748396039009094,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 0.45,
665
+ "learning_rate": 3.3710808134621577e-06,
666
+ "logits/chosen": -1.1595834493637085,
667
+ "logits/rejected": -1.126773476600647,
668
+ "logps/chosen": -298.33538818359375,
669
+ "logps/rejected": -295.24468994140625,
670
+ "loss": 0.0549,
671
+ "rewards/accuracies": 0.41874998807907104,
672
+ "rewards/chosen": -0.016742903739213943,
673
+ "rewards/margins": 0.022577572613954544,
674
+ "rewards/rejected": -0.03932047635316849,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 0.46,
679
+ "learning_rate": 3.2831675801707126e-06,
680
+ "logits/chosen": -1.1363260746002197,
681
+ "logits/rejected": -1.1643562316894531,
682
+ "logps/chosen": -259.0823669433594,
683
+ "logps/rejected": -262.0459899902344,
684
+ "loss": 0.08,
685
+ "rewards/accuracies": 0.35624998807907104,
686
+ "rewards/chosen": -0.01677551493048668,
687
+ "rewards/margins": 0.012358926236629486,
688
+ "rewards/rejected": -0.029134441167116165,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 0.47,
693
+ "learning_rate": 3.194166797377289e-06,
694
+ "logits/chosen": -1.0814932584762573,
695
+ "logits/rejected": -1.0023730993270874,
696
+ "logps/chosen": -267.9959411621094,
697
+ "logps/rejected": -241.27685546875,
698
+ "loss": 0.086,
699
+ "rewards/accuracies": 0.3125,
700
+ "rewards/chosen": -0.014996061101555824,
701
+ "rewards/margins": 0.004354935139417648,
702
+ "rewards/rejected": -0.019350996240973473,
703
+ "step": 440
704
+ },
705
+ {
706
+ "epoch": 0.48,
707
+ "learning_rate": 3.104202056455501e-06,
708
+ "logits/chosen": -1.1119335889816284,
709
+ "logits/rejected": -1.036833643913269,
710
+ "logps/chosen": -264.6695251464844,
711
+ "logps/rejected": -294.46173095703125,
712
+ "loss": 0.0732,
713
+ "rewards/accuracies": 0.3812499940395355,
714
+ "rewards/chosen": -0.00541292130947113,
715
+ "rewards/margins": 0.018471624702215195,
716
+ "rewards/rejected": -0.023884546011686325,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 0.49,
721
+ "learning_rate": 3.013398287384144e-06,
722
+ "logits/chosen": -1.100239872932434,
723
+ "logits/rejected": -0.9833891987800598,
724
+ "logps/chosen": -265.06292724609375,
725
+ "logps/rejected": -251.05789184570312,
726
+ "loss": 0.0769,
727
+ "rewards/accuracies": 0.3499999940395355,
728
+ "rewards/chosen": -0.00949514377862215,
729
+ "rewards/margins": 0.011730840429663658,
730
+ "rewards/rejected": -0.021225983276963234,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 0.5,
735
+ "learning_rate": 2.9218815852625717e-06,
736
+ "logits/chosen": -1.0965584516525269,
737
+ "logits/rejected": -1.1050448417663574,
738
+ "logps/chosen": -245.2798309326172,
739
+ "logps/rejected": -247.6891632080078,
740
+ "loss": 0.0867,
741
+ "rewards/accuracies": 0.375,
742
+ "rewards/chosen": -0.00036931521026417613,
743
+ "rewards/margins": 0.01472887396812439,
744
+ "rewards/rejected": -0.015098191797733307,
745
+ "step": 470
746
+ },
747
+ {
748
+ "epoch": 0.51,
749
+ "learning_rate": 2.829779035208113e-06,
750
+ "logits/chosen": -1.1267783641815186,
751
+ "logits/rejected": -1.0608749389648438,
752
+ "logps/chosen": -276.13861083984375,
753
+ "logps/rejected": -242.2423858642578,
754
+ "loss": 0.0866,
755
+ "rewards/accuracies": 0.4000000059604645,
756
+ "rewards/chosen": 0.003509046044200659,
757
+ "rewards/margins": 0.01866857148706913,
758
+ "rewards/rejected": -0.015159524977207184,
759
+ "step": 480
760
+ },
761
+ {
762
+ "epoch": 0.52,
763
+ "learning_rate": 2.737218535878705e-06,
764
+ "logits/chosen": -1.1088229417800903,
765
+ "logits/rejected": -1.0114442110061646,
766
+ "logps/chosen": -264.47747802734375,
767
+ "logps/rejected": -256.66265869140625,
768
+ "loss": 0.0697,
769
+ "rewards/accuracies": 0.45625001192092896,
770
+ "rewards/chosen": -0.0013200236717239022,
771
+ "rewards/margins": 0.02153395116329193,
772
+ "rewards/rejected": -0.022853974252939224,
773
+ "step": 490
774
+ },
775
+ {
776
+ "epoch": 0.53,
777
+ "learning_rate": 2.64432862186579e-06,
778
+ "logits/chosen": -1.112363338470459,
779
+ "logits/rejected": -1.0634427070617676,
780
+ "logps/chosen": -263.20428466796875,
781
+ "logps/rejected": -262.7369079589844,
782
+ "loss": 0.068,
783
+ "rewards/accuracies": 0.41874998807907104,
784
+ "rewards/chosen": -0.0034271120093762875,
785
+ "rewards/margins": 0.013932084664702415,
786
+ "rewards/rejected": -0.01735919900238514,
787
+ "step": 500
788
+ },
789
+ {
790
+ "epoch": 0.53,
791
+ "eval_logits/chosen": -1.0329285860061646,
792
+ "eval_logits/rejected": -0.9790877103805542,
793
+ "eval_logps/chosen": -402.9732360839844,
794
+ "eval_logps/rejected": -399.08111572265625,
795
+ "eval_loss": 0.04319905489683151,
796
+ "eval_rewards/accuracies": 0.5954999923706055,
797
+ "eval_rewards/chosen": -0.006668027024716139,
798
+ "eval_rewards/margins": 0.025177694857120514,
799
+ "eval_rewards/rejected": -0.03184572234749794,
800
+ "eval_runtime": 545.9259,
801
+ "eval_samples_per_second": 3.664,
802
+ "eval_steps_per_second": 0.916,
803
+ "step": 500
804
+ },
805
+ {
806
+ "epoch": 0.54,
807
+ "learning_rate": 2.551238285204126e-06,
808
+ "logits/chosen": -1.0819242000579834,
809
+ "logits/rejected": -1.0868195295333862,
810
+ "logps/chosen": -204.0397186279297,
811
+ "logps/rejected": -212.2799835205078,
812
+ "loss": 0.0808,
813
+ "rewards/accuracies": 0.375,
814
+ "rewards/chosen": -0.003992350306361914,
815
+ "rewards/margins": 0.010435246862471104,
816
+ "rewards/rejected": -0.01442759484052658,
817
+ "step": 510
818
+ },
819
+ {
820
+ "epoch": 0.55,
821
+ "learning_rate": 2.4580767962463688e-06,
822
+ "logits/chosen": -1.1004348993301392,
823
+ "logits/rejected": -1.0923566818237305,
824
+ "logps/chosen": -287.0587463378906,
825
+ "logps/rejected": -295.1195373535156,
826
+ "loss": 0.0783,
827
+ "rewards/accuracies": 0.45625001192092896,
828
+ "rewards/chosen": -0.005773237906396389,
829
+ "rewards/margins": 0.016825079917907715,
830
+ "rewards/rejected": -0.02259831875562668,
831
+ "step": 520
832
+ },
833
+ {
834
+ "epoch": 0.57,
835
+ "learning_rate": 2.3649735241511546e-06,
836
+ "logits/chosen": -1.025914192199707,
837
+ "logits/rejected": -1.0844639539718628,
838
+ "logps/chosen": -233.93783569335938,
839
+ "logps/rejected": -278.87310791015625,
840
+ "loss": 0.0701,
841
+ "rewards/accuracies": 0.4375,
842
+ "rewards/chosen": 0.0003628075937740505,
843
+ "rewards/margins": 0.019406834617257118,
844
+ "rewards/rejected": -0.019044026732444763,
845
+ "step": 530
846
+ },
847
+ {
848
+ "epoch": 0.58,
849
+ "learning_rate": 2.2720577572339914e-06,
850
+ "logits/chosen": -1.0895086526870728,
851
+ "logits/rejected": -0.9644759297370911,
852
+ "logps/chosen": -313.9569396972656,
853
+ "logps/rejected": -267.2975158691406,
854
+ "loss": 0.0828,
855
+ "rewards/accuracies": 0.375,
856
+ "rewards/chosen": -0.009684056974947453,
857
+ "rewards/margins": 0.013868686743080616,
858
+ "rewards/rejected": -0.02355274185538292,
859
+ "step": 540
860
+ },
861
+ {
862
+ "epoch": 0.59,
863
+ "learning_rate": 2.1794585234303995e-06,
864
+ "logits/chosen": -1.039159893989563,
865
+ "logits/rejected": -1.0821508169174194,
866
+ "logps/chosen": -257.9473571777344,
867
+ "logps/rejected": -274.3190002441406,
868
+ "loss": 0.0743,
869
+ "rewards/accuracies": 0.3687500059604645,
870
+ "rewards/chosen": -0.0062354551628232,
871
+ "rewards/margins": 0.01928626373410225,
872
+ "rewards/rejected": -0.025521719828248024,
873
+ "step": 550
874
+ },
875
+ {
876
+ "epoch": 0.6,
877
+ "learning_rate": 2.0873044111206407e-06,
878
+ "logits/chosen": -1.0668865442276,
879
+ "logits/rejected": -1.0260220766067505,
880
+ "logps/chosen": -271.9998474121094,
881
+ "logps/rejected": -282.83038330078125,
882
+ "loss": 0.0714,
883
+ "rewards/accuracies": 0.4000000059604645,
884
+ "rewards/chosen": 0.0038192705251276493,
885
+ "rewards/margins": 0.019924623891711235,
886
+ "rewards/rejected": -0.016105355694890022,
887
+ "step": 560
888
+ },
889
+ {
890
+ "epoch": 0.61,
891
+ "learning_rate": 1.9957233905648293e-06,
892
+ "logits/chosen": -1.054386019706726,
893
+ "logits/rejected": -1.042152762413025,
894
+ "logps/chosen": -284.9267272949219,
895
+ "logps/rejected": -249.73110961914062,
896
+ "loss": 0.0739,
897
+ "rewards/accuracies": 0.40625,
898
+ "rewards/chosen": 0.0018435310339555144,
899
+ "rewards/margins": 0.013795648701488972,
900
+ "rewards/rejected": -0.011952118948101997,
901
+ "step": 570
902
+ },
903
+ {
904
+ "epoch": 0.62,
905
+ "learning_rate": 1.904842636196402e-06,
906
+ "logits/chosen": -1.027489185333252,
907
+ "logits/rejected": -1.0255085229873657,
908
+ "logps/chosen": -250.985107421875,
909
+ "logps/rejected": -246.9531707763672,
910
+ "loss": 0.0877,
911
+ "rewards/accuracies": 0.40625,
912
+ "rewards/chosen": -0.00342792016454041,
913
+ "rewards/margins": 0.010951442644000053,
914
+ "rewards/rejected": -0.01437936257570982,
915
+ "step": 580
916
+ },
917
+ {
918
+ "epoch": 0.63,
919
+ "learning_rate": 1.814788350020726e-06,
920
+ "logits/chosen": -1.0244547128677368,
921
+ "logits/rejected": -1.0658903121948242,
922
+ "logps/chosen": -295.65191650390625,
923
+ "logps/rejected": -290.0276184082031,
924
+ "loss": 0.0697,
925
+ "rewards/accuracies": 0.42500001192092896,
926
+ "rewards/chosen": -0.00019127638370264322,
927
+ "rewards/margins": 0.02073330618441105,
928
+ "rewards/rejected": -0.020924581214785576,
929
+ "step": 590
930
+ },
931
+ {
932
+ "epoch": 0.64,
933
+ "learning_rate": 1.725685586364051e-06,
934
+ "logits/chosen": -1.0251004695892334,
935
+ "logits/rejected": -1.0600395202636719,
936
+ "logps/chosen": -244.59384155273438,
937
+ "logps/rejected": -243.28256225585938,
938
+ "loss": 0.0847,
939
+ "rewards/accuracies": 0.41874998807907104,
940
+ "rewards/chosen": -0.001342198345810175,
941
+ "rewards/margins": 0.01429178100079298,
942
+ "rewards/rejected": -0.015633979812264442,
943
+ "step": 600
944
+ },
945
+ {
946
+ "epoch": 0.64,
947
+ "eval_logits/chosen": -1.0363810062408447,
948
+ "eval_logits/rejected": -0.9837189316749573,
949
+ "eval_logps/chosen": -401.28790283203125,
950
+ "eval_logps/rejected": -398.47442626953125,
951
+ "eval_loss": 0.042725615203380585,
952
+ "eval_rewards/accuracies": 0.5945000052452087,
953
+ "eval_rewards/chosen": -0.0049826642498373985,
954
+ "eval_rewards/margins": 0.026256347075104713,
955
+ "eval_rewards/rejected": -0.031239010393619537,
956
+ "eval_runtime": 546.1434,
957
+ "eval_samples_per_second": 3.662,
958
+ "eval_steps_per_second": 0.916,
959
+ "step": 600
960
+ },
961
+ {
962
+ "epoch": 0.65,
963
+ "learning_rate": 1.6376580782162172e-06,
964
+ "logits/chosen": -1.03734290599823,
965
+ "logits/rejected": -0.9538782238960266,
966
+ "logps/chosen": -302.88079833984375,
967
+ "logps/rejected": -275.7502136230469,
968
+ "loss": 0.07,
969
+ "rewards/accuracies": 0.4000000059604645,
970
+ "rewards/chosen": -0.003554628463461995,
971
+ "rewards/margins": 0.016584644094109535,
972
+ "rewards/rejected": -0.020139271393418312,
973
+ "step": 610
974
+ },
975
+ {
976
+ "epoch": 0.66,
977
+ "learning_rate": 1.550828065408227e-06,
978
+ "logits/chosen": -1.0758119821548462,
979
+ "logits/rejected": -0.9793124198913574,
980
+ "logps/chosen": -256.09783935546875,
981
+ "logps/rejected": -237.6195831298828,
982
+ "loss": 0.0857,
983
+ "rewards/accuracies": 0.36250001192092896,
984
+ "rewards/chosen": -0.004253728315234184,
985
+ "rewards/margins": 0.020320799201726913,
986
+ "rewards/rejected": -0.024574527516961098,
987
+ "step": 620
988
+ },
989
+ {
990
+ "epoch": 0.67,
991
+ "learning_rate": 1.4653161248633053e-06,
992
+ "logits/chosen": -1.1112511157989502,
993
+ "logits/rejected": -1.0120253562927246,
994
+ "logps/chosen": -217.60305786132812,
995
+ "logps/rejected": -254.78884887695312,
996
+ "loss": 0.099,
997
+ "rewards/accuracies": 0.3812499940395355,
998
+ "rewards/chosen": -0.007809498347342014,
999
+ "rewards/margins": 0.020094871520996094,
1000
+ "rewards/rejected": -0.027904370799660683,
1001
+ "step": 630
1002
+ },
1003
+ {
1004
+ "epoch": 0.68,
1005
+ "learning_rate": 1.381241003157162e-06,
1006
+ "logits/chosen": -1.1298284530639648,
1007
+ "logits/rejected": -1.0621023178100586,
1008
+ "logps/chosen": -286.02459716796875,
1009
+ "logps/rejected": -299.3070373535156,
1010
+ "loss": 0.0677,
1011
+ "rewards/accuracies": 0.39375001192092896,
1012
+ "rewards/chosen": -0.011842104606330395,
1013
+ "rewards/margins": 0.014377683401107788,
1014
+ "rewards/rejected": -0.026219788938760757,
1015
+ "step": 640
1016
+ },
1017
+ {
1018
+ "epoch": 0.69,
1019
+ "learning_rate": 1.298719451619979e-06,
1020
+ "logits/chosen": -1.0666725635528564,
1021
+ "logits/rejected": -1.0321277379989624,
1022
+ "logps/chosen": -300.1710205078125,
1023
+ "logps/rejected": -384.95751953125,
1024
+ "loss": 0.073,
1025
+ "rewards/accuracies": 0.5,
1026
+ "rewards/chosen": -0.0033969897776842117,
1027
+ "rewards/margins": 0.039332348853349686,
1028
+ "rewards/rejected": -0.04272934049367905,
1029
+ "step": 650
1030
+ },
1031
+ {
1032
+ "epoch": 0.7,
1033
+ "learning_rate": 1.2178660642091036e-06,
1034
+ "logits/chosen": -1.062753677368164,
1035
+ "logits/rejected": -1.034977674484253,
1036
+ "logps/chosen": -275.3869934082031,
1037
+ "logps/rejected": -275.52667236328125,
1038
+ "loss": 0.0798,
1039
+ "rewards/accuracies": 0.46875,
1040
+ "rewards/chosen": -0.0005760884960182011,
1041
+ "rewards/margins": 0.0214390866458416,
1042
+ "rewards/rejected": -0.022015176713466644,
1043
+ "step": 660
1044
+ },
1045
+ {
1046
+ "epoch": 0.71,
1047
+ "learning_rate": 1.1387931183775821e-06,
1048
+ "logits/chosen": -1.072608470916748,
1049
+ "logits/rejected": -1.0768264532089233,
1050
+ "logps/chosen": -300.6933288574219,
1051
+ "logps/rejected": -286.68951416015625,
1052
+ "loss": 0.0876,
1053
+ "rewards/accuracies": 0.40625,
1054
+ "rewards/chosen": -0.005371665116399527,
1055
+ "rewards/margins": 0.01708284579217434,
1056
+ "rewards/rejected": -0.022454511374235153,
1057
+ "step": 670
1058
+ },
1059
+ {
1060
+ "epoch": 0.73,
1061
+ "learning_rate": 1.061610419159532e-06,
1062
+ "logits/chosen": -1.1137611865997314,
1063
+ "logits/rejected": -1.0089493989944458,
1064
+ "logps/chosen": -284.4278259277344,
1065
+ "logps/rejected": -263.4560546875,
1066
+ "loss": 0.0803,
1067
+ "rewards/accuracies": 0.4000000059604645,
1068
+ "rewards/chosen": -0.0019712348002940416,
1069
+ "rewards/margins": 0.015994885936379433,
1070
+ "rewards/rejected": -0.017966121435165405,
1071
+ "step": 680
1072
+ },
1073
+ {
1074
+ "epoch": 0.74,
1075
+ "learning_rate": 9.864251466888364e-07,
1076
+ "logits/chosen": -1.0998120307922363,
1077
+ "logits/rejected": -1.0427272319793701,
1078
+ "logps/chosen": -273.5634460449219,
1079
+ "logps/rejected": -273.6974792480469,
1080
+ "loss": 0.0779,
1081
+ "rewards/accuracies": 0.40625,
1082
+ "rewards/chosen": -0.00520918658003211,
1083
+ "rewards/margins": 0.018212206661701202,
1084
+ "rewards/rejected": -0.02342139557003975,
1085
+ "step": 690
1086
+ },
1087
+ {
1088
+ "epoch": 0.75,
1089
+ "learning_rate": 9.133417073629288e-07,
1090
+ "logits/chosen": -1.0788428783416748,
1091
+ "logits/rejected": -1.0710010528564453,
1092
+ "logps/chosen": -295.2659912109375,
1093
+ "logps/rejected": -301.9012145996094,
1094
+ "loss": 0.0519,
1095
+ "rewards/accuracies": 0.45625001192092896,
1096
+ "rewards/chosen": -0.006222005933523178,
1097
+ "rewards/margins": 0.023821452632546425,
1098
+ "rewards/rejected": -0.030043456703424454,
1099
+ "step": 700
1100
+ },
1101
+ {
1102
+ "epoch": 0.75,
1103
+ "eval_logits/chosen": -1.0359783172607422,
1104
+ "eval_logits/rejected": -0.9871743321418762,
1105
+ "eval_logps/chosen": -404.5330505371094,
1106
+ "eval_logps/rejected": -404.9790954589844,
1107
+ "eval_loss": 0.04230288788676262,
1108
+ "eval_rewards/accuracies": 0.590499997138977,
1109
+ "eval_rewards/chosen": -0.008227824233472347,
1110
+ "eval_rewards/margins": 0.02951584756374359,
1111
+ "eval_rewards/rejected": -0.03774367272853851,
1112
+ "eval_runtime": 545.9714,
1113
+ "eval_samples_per_second": 3.663,
1114
+ "eval_steps_per_second": 0.916,
1115
+ "step": 700
1116
+ },
1117
+ {
1118
+ "epoch": 0.76,
1119
+ "learning_rate": 8.424615888583332e-07,
1120
+ "logits/chosen": -1.0752325057983398,
1121
+ "logits/rejected": -1.005172848701477,
1122
+ "logps/chosen": -263.06549072265625,
1123
+ "logps/rejected": -263.9055480957031,
1124
+ "loss": 0.0702,
1125
+ "rewards/accuracies": 0.45625001192092896,
1126
+ "rewards/chosen": -0.0012165942462161183,
1127
+ "rewards/margins": 0.0245666466653347,
1128
+ "rewards/rejected": -0.025783240795135498,
1129
+ "step": 710
1130
+ },
1131
+ {
1132
+ "epoch": 0.77,
1133
+ "learning_rate": 7.738832191993092e-07,
1134
+ "logits/chosen": -1.0686012506484985,
1135
+ "logits/rejected": -1.0680710077285767,
1136
+ "logps/chosen": -260.68853759765625,
1137
+ "logps/rejected": -299.7347717285156,
1138
+ "loss": 0.0669,
1139
+ "rewards/accuracies": 0.375,
1140
+ "rewards/chosen": -0.006182204931974411,
1141
+ "rewards/margins": 0.016528166830539703,
1142
+ "rewards/rejected": -0.022710371762514114,
1143
+ "step": 720
1144
+ },
1145
+ {
1146
+ "epoch": 0.78,
1147
+ "learning_rate": 7.077018300752917e-07,
1148
+ "logits/chosen": -1.1110173463821411,
1149
+ "logits/rejected": -1.0842745304107666,
1150
+ "logps/chosen": -270.8677673339844,
1151
+ "logps/rejected": -273.29339599609375,
1152
+ "loss": 0.07,
1153
+ "rewards/accuracies": 0.4312500059604645,
1154
+ "rewards/chosen": -0.0033080249559134245,
1155
+ "rewards/margins": 0.02419392392039299,
1156
+ "rewards/rejected": -0.027501946315169334,
1157
+ "step": 730
1158
+ },
1159
+ {
1160
+ "epoch": 0.79,
1161
+ "learning_rate": 6.440093245969342e-07,
1162
+ "logits/chosen": -1.1270530223846436,
1163
+ "logits/rejected": -1.0608434677124023,
1164
+ "logps/chosen": -296.01556396484375,
1165
+ "logps/rejected": -293.2162170410156,
1166
+ "loss": 0.0753,
1167
+ "rewards/accuracies": 0.41874998807907104,
1168
+ "rewards/chosen": -0.004587044008076191,
1169
+ "rewards/margins": 0.017461195588111877,
1170
+ "rewards/rejected": -0.022048238664865494,
1171
+ "step": 740
1172
+ },
1173
+ {
1174
+ "epoch": 0.8,
1175
+ "learning_rate": 5.828941496744075e-07,
1176
+ "logits/chosen": -1.1132383346557617,
1177
+ "logits/rejected": -1.0976136922836304,
1178
+ "logps/chosen": -258.3694763183594,
1179
+ "logps/rejected": -271.54095458984375,
1180
+ "loss": 0.0907,
1181
+ "rewards/accuracies": 0.38749998807907104,
1182
+ "rewards/chosen": -0.012257089838385582,
1183
+ "rewards/margins": 0.019415555521845818,
1184
+ "rewards/rejected": -0.0316726490855217,
1185
+ "step": 750
1186
+ },
1187
+ {
1188
+ "epoch": 0.81,
1189
+ "learning_rate": 5.244411731951671e-07,
1190
+ "logits/chosen": -1.1247339248657227,
1191
+ "logits/rejected": -1.0423662662506104,
1192
+ "logps/chosen": -287.46160888671875,
1193
+ "logps/rejected": -290.23736572265625,
1194
+ "loss": 0.0781,
1195
+ "rewards/accuracies": 0.39375001192092896,
1196
+ "rewards/chosen": -0.008325648494064808,
1197
+ "rewards/margins": 0.021062636747956276,
1198
+ "rewards/rejected": -0.02938828244805336,
1199
+ "step": 760
1200
+ },
1201
+ {
1202
+ "epoch": 0.82,
1203
+ "learning_rate": 4.6873156617173594e-07,
1204
+ "logits/chosen": -1.1069273948669434,
1205
+ "logits/rejected": -1.086753010749817,
1206
+ "logps/chosen": -298.63433837890625,
1207
+ "logps/rejected": -294.5317077636719,
1208
+ "loss": 0.0679,
1209
+ "rewards/accuracies": 0.4437499940395355,
1210
+ "rewards/chosen": -0.007812710478901863,
1211
+ "rewards/margins": 0.02419663593173027,
1212
+ "rewards/rejected": -0.032009344547986984,
1213
+ "step": 770
1214
+ },
1215
+ {
1216
+ "epoch": 0.83,
1217
+ "learning_rate": 4.1584269002318653e-07,
1218
+ "logits/chosen": -1.1254509687423706,
1219
+ "logits/rejected": -1.0976530313491821,
1220
+ "logps/chosen": -287.83282470703125,
1221
+ "logps/rejected": -289.3851623535156,
1222
+ "loss": 0.0738,
1223
+ "rewards/accuracies": 0.4437499940395355,
1224
+ "rewards/chosen": -0.009489515796303749,
1225
+ "rewards/margins": 0.02022114023566246,
1226
+ "rewards/rejected": -0.029710659757256508,
1227
+ "step": 780
1228
+ },
1229
+ {
1230
+ "epoch": 0.84,
1231
+ "learning_rate": 3.658479891468258e-07,
1232
+ "logits/chosen": -1.1135103702545166,
1233
+ "logits/rejected": -1.0031986236572266,
1234
+ "logps/chosen": -292.74835205078125,
1235
+ "logps/rejected": -297.3404846191406,
1236
+ "loss": 0.0784,
1237
+ "rewards/accuracies": 0.3687500059604645,
1238
+ "rewards/chosen": -0.003799052909016609,
1239
+ "rewards/margins": 0.016090305522084236,
1240
+ "rewards/rejected": -0.019889358431100845,
1241
+ "step": 790
1242
+ },
1243
+ {
1244
+ "epoch": 0.85,
1245
+ "learning_rate": 3.18816888929272e-07,
1246
+ "logits/chosen": -1.0912225246429443,
1247
+ "logits/rejected": -1.0440576076507568,
1248
+ "logps/chosen": -273.9382019042969,
1249
+ "logps/rejected": -279.7242126464844,
1250
+ "loss": 0.0742,
1251
+ "rewards/accuracies": 0.4437499940395355,
1252
+ "rewards/chosen": -0.00514855096116662,
1253
+ "rewards/margins": 0.019596170634031296,
1254
+ "rewards/rejected": -0.024744722992181778,
1255
+ "step": 800
1256
+ },
1257
+ {
1258
+ "epoch": 0.85,
1259
+ "eval_logits/chosen": -1.055617332458496,
1260
+ "eval_logits/rejected": -1.0109219551086426,
1261
+ "eval_logps/chosen": -406.80352783203125,
1262
+ "eval_logps/rejected": -409.2462158203125,
1263
+ "eval_loss": 0.04217638820409775,
1264
+ "eval_rewards/accuracies": 0.6000000238418579,
1265
+ "eval_rewards/chosen": -0.010498268529772758,
1266
+ "eval_rewards/margins": 0.0315125547349453,
1267
+ "eval_rewards/rejected": -0.042010821402072906,
1268
+ "eval_runtime": 546.1165,
1269
+ "eval_samples_per_second": 3.662,
1270
+ "eval_steps_per_second": 0.916,
1271
+ "step": 800
1272
+ },
1273
+ {
1274
+ "epoch": 0.86,
1275
+ "learning_rate": 2.748146993385484e-07,
1276
+ "logits/chosen": -1.118160367012024,
1277
+ "logits/rejected": -1.0629098415374756,
1278
+ "logps/chosen": -223.4706573486328,
1279
+ "logps/rejected": -247.4434356689453,
1280
+ "loss": 0.0963,
1281
+ "rewards/accuracies": 0.36250001192092896,
1282
+ "rewards/chosen": -0.0038195624947547913,
1283
+ "rewards/margins": 0.02616865560412407,
1284
+ "rewards/rejected": -0.029988214373588562,
1285
+ "step": 810
1286
+ },
1287
+ {
1288
+ "epoch": 0.87,
1289
+ "learning_rate": 2.3390252423108077e-07,
1290
+ "logits/chosen": -1.1013834476470947,
1291
+ "logits/rejected": -1.1045656204223633,
1292
+ "logps/chosen": -240.3532257080078,
1293
+ "logps/rejected": -258.44989013671875,
1294
+ "loss": 0.0812,
1295
+ "rewards/accuracies": 0.41874998807907104,
1296
+ "rewards/chosen": -0.002859788713976741,
1297
+ "rewards/margins": 0.029717862606048584,
1298
+ "rewards/rejected": -0.03257765248417854,
1299
+ "step": 820
1300
+ },
1301
+ {
1302
+ "epoch": 0.89,
1303
+ "learning_rate": 1.961371764995243e-07,
1304
+ "logits/chosen": -1.1316301822662354,
1305
+ "logits/rejected": -1.1215277910232544,
1306
+ "logps/chosen": -266.54931640625,
1307
+ "logps/rejected": -260.0931701660156,
1308
+ "loss": 0.0782,
1309
+ "rewards/accuracies": 0.39375001192092896,
1310
+ "rewards/chosen": -0.006319983396679163,
1311
+ "rewards/margins": 0.014086413197219372,
1312
+ "rewards/rejected": -0.020406395196914673,
1313
+ "step": 830
1314
+ },
1315
+ {
1316
+ "epoch": 0.9,
1317
+ "learning_rate": 1.61571099179261e-07,
1318
+ "logits/chosen": -1.1706424951553345,
1319
+ "logits/rejected": -1.0354385375976562,
1320
+ "logps/chosen": -278.2705993652344,
1321
+ "logps/rejected": -276.22967529296875,
1322
+ "loss": 0.0814,
1323
+ "rewards/accuracies": 0.4000000059604645,
1324
+ "rewards/chosen": -0.009933208115398884,
1325
+ "rewards/margins": 0.019498441368341446,
1326
+ "rewards/rejected": -0.029431650415062904,
1327
+ "step": 840
1328
+ },
1329
+ {
1330
+ "epoch": 0.91,
1331
+ "learning_rate": 1.3025229262312367e-07,
1332
+ "logits/chosen": -1.0535192489624023,
1333
+ "logits/rejected": -1.0249392986297607,
1334
+ "logps/chosen": -238.9296875,
1335
+ "logps/rejected": -260.4687805175781,
1336
+ "loss": 0.0732,
1337
+ "rewards/accuracies": 0.4000000059604645,
1338
+ "rewards/chosen": -0.0005777518963441253,
1339
+ "rewards/margins": 0.021599723026156425,
1340
+ "rewards/rejected": -0.022177476435899734,
1341
+ "step": 850
1342
+ },
1343
+ {
1344
+ "epoch": 0.92,
1345
+ "learning_rate": 1.0222424784546853e-07,
1346
+ "logits/chosen": -1.1237046718597412,
1347
+ "logits/rejected": -1.1744358539581299,
1348
+ "logps/chosen": -290.61187744140625,
1349
+ "logps/rejected": -289.50775146484375,
1350
+ "loss": 0.0682,
1351
+ "rewards/accuracies": 0.45625001192092896,
1352
+ "rewards/chosen": -0.00292245764285326,
1353
+ "rewards/margins": 0.022101474925875664,
1354
+ "rewards/rejected": -0.0250239335000515,
1355
+ "step": 860
1356
+ },
1357
+ {
1358
+ "epoch": 0.93,
1359
+ "learning_rate": 7.752588612816553e-08,
1360
+ "logits/chosen": -1.0622873306274414,
1361
+ "logits/rejected": -1.047603964805603,
1362
+ "logps/chosen": -290.8896484375,
1363
+ "logps/rejected": -276.6893005371094,
1364
+ "loss": 0.0607,
1365
+ "rewards/accuracies": 0.4000000059604645,
1366
+ "rewards/chosen": -0.005569613538682461,
1367
+ "rewards/margins": 0.024782858788967133,
1368
+ "rewards/rejected": -0.030352476984262466,
1369
+ "step": 870
1370
+ },
1371
+ {
1372
+ "epoch": 0.94,
1373
+ "learning_rate": 5.619150497236991e-08,
1374
+ "logits/chosen": -1.1374906301498413,
1375
+ "logits/rejected": -1.079929232597351,
1376
+ "logps/chosen": -223.29623413085938,
1377
+ "logps/rejected": -222.3146209716797,
1378
+ "loss": 0.0661,
1379
+ "rewards/accuracies": 0.3687500059604645,
1380
+ "rewards/chosen": -0.0003207772097084671,
1381
+ "rewards/margins": 0.016658511012792587,
1382
+ "rewards/rejected": -0.016979288309812546,
1383
+ "step": 880
1384
+ },
1385
+ {
1386
+ "epoch": 0.95,
1387
+ "learning_rate": 3.825073047112743e-08,
1388
+ "logits/chosen": -1.0372432470321655,
1389
+ "logits/rejected": -1.0072650909423828,
1390
+ "logps/chosen": -297.5245056152344,
1391
+ "logps/rejected": -319.53076171875,
1392
+ "loss": 0.0776,
1393
+ "rewards/accuracies": 0.45625001192092896,
1394
+ "rewards/chosen": -0.007688297424465418,
1395
+ "rewards/margins": 0.022589299827814102,
1396
+ "rewards/rejected": -0.030277591198682785,
1397
+ "step": 890
1398
+ },
1399
+ {
1400
+ "epoch": 0.96,
1401
+ "learning_rate": 2.372847616895685e-08,
1402
+ "logits/chosen": -1.096644639968872,
1403
+ "logits/rejected": -1.0533974170684814,
1404
+ "logps/chosen": -268.70343017578125,
1405
+ "logps/rejected": -255.3015594482422,
1406
+ "loss": 0.0768,
1407
+ "rewards/accuracies": 0.41874998807907104,
1408
+ "rewards/chosen": -0.003904106793925166,
1409
+ "rewards/margins": 0.022419685497879982,
1410
+ "rewards/rejected": -0.02632378600537777,
1411
+ "step": 900
1412
+ },
1413
+ {
1414
+ "epoch": 0.96,
1415
+ "eval_logits/chosen": -1.0502386093139648,
1416
+ "eval_logits/rejected": -1.0050266981124878,
1417
+ "eval_logps/chosen": -406.3475341796875,
1418
+ "eval_logps/rejected": -408.7396545410156,
1419
+ "eval_loss": 0.04211420938372612,
1420
+ "eval_rewards/accuracies": 0.5929999947547913,
1421
+ "eval_rewards/chosen": -0.01004225667566061,
1422
+ "eval_rewards/margins": 0.031461965292692184,
1423
+ "eval_rewards/rejected": -0.04150421544909477,
1424
+ "eval_runtime": 545.8034,
1425
+ "eval_samples_per_second": 3.664,
1426
+ "eval_steps_per_second": 0.916,
1427
+ "step": 900
1428
+ },
1429
+ {
1430
+ "epoch": 0.97,
1431
+ "learning_rate": 1.264490846553279e-08,
1432
+ "logits/chosen": -1.1268881559371948,
1433
+ "logits/rejected": -1.0797778367996216,
1434
+ "logps/chosen": -271.73004150390625,
1435
+ "logps/rejected": -292.3177490234375,
1436
+ "loss": 0.0679,
1437
+ "rewards/accuracies": 0.41874998807907104,
1438
+ "rewards/chosen": -0.008194219321012497,
1439
+ "rewards/margins": 0.016915880143642426,
1440
+ "rewards/rejected": -0.025110099464654922,
1441
+ "step": 910
1442
+ },
1443
+ {
1444
+ "epoch": 0.98,
1445
+ "learning_rate": 5.015418611516165e-09,
1446
+ "logits/chosen": -1.13059401512146,
1447
+ "logits/rejected": -1.099302887916565,
1448
+ "logps/chosen": -252.9181365966797,
1449
+ "logps/rejected": -257.0358581542969,
1450
+ "loss": 0.0898,
1451
+ "rewards/accuracies": 0.3687500059604645,
1452
+ "rewards/chosen": -0.005635020788758993,
1453
+ "rewards/margins": 0.021289747208356857,
1454
+ "rewards/rejected": -0.026924768462777138,
1455
+ "step": 920
1456
+ },
1457
+ {
1458
+ "epoch": 0.99,
1459
+ "learning_rate": 8.506013354186993e-10,
1460
+ "logits/chosen": -1.0815865993499756,
1461
+ "logits/rejected": -1.0594186782836914,
1462
+ "logps/chosen": -270.22955322265625,
1463
+ "logps/rejected": -268.7232360839844,
1464
+ "loss": 0.0661,
1465
+ "rewards/accuracies": 0.40625,
1466
+ "rewards/chosen": 0.0016765497857704759,
1467
+ "rewards/margins": 0.02203894779086113,
1468
+ "rewards/rejected": -0.020362399518489838,
1469
+ "step": 930
1470
+ },
1471
+ {
1472
+ "epoch": 1.0,
1473
+ "step": 937,
1474
+ "total_flos": 0.0,
1475
+ "train_loss": 0.07849642387894454,
1476
+ "train_runtime": 13138.6455,
1477
+ "train_samples_per_second": 1.142,
1478
+ "train_steps_per_second": 0.071
1479
+ }
1480
+ ],
1481
+ "logging_steps": 10,
1482
+ "max_steps": 937,
1483
+ "num_input_tokens_seen": 0,
1484
+ "num_train_epochs": 1,
1485
+ "save_steps": 100,
1486
+ "total_flos": 0.0,
1487
+ "train_batch_size": 4,
1488
+ "trial_name": null,
1489
+ "trial_params": null
1490
+ }
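trainer_state.json records the full log history: training metrics every 10 steps and an evaluation every 100 steps, matching the table in the README. A small sketch for pulling the evaluation-loss curve back out of it:

```python
# Sketch: extract the eval-loss curve from trainer_state.json's log_history.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

eval_points = [
    (entry["step"], entry["eval_loss"])
    for entry in state["log_history"]
    if "eval_loss" in entry
]
for step, loss in eval_points:
    print(f"step {step:4d}: eval_loss = {loss:.4f}")
```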