Broyojo committed on
Commit
b5ae70e
·
verified ·
1 Parent(s): 22aa67f

upload E2b (grpo_thinking)

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: Qwen/Qwen3-VL-2B-Thinking
4
+ library_name: peft
5
+ tags:
6
+ - vision-language
7
+ - new-yorker
8
+ - humor
9
+ - rlhf
10
+ - grpo-thinking
11
+ datasets:
12
+ - yguooo/newyorker_caption_ranking
13
+ language:
14
+ - en
15
+ ---
16
+
17
+ # humor-r1 — GRPO, with thinking (Qwen3-VL-2B-Thinking + LoRA) (E2b)
18
+
19
+ LoRA on Qwen3-VL-2B-Thinking trained via GRPO against the Bradley-Terry reward model `HumorR1/rm-qwen25vl-3b-nodesc`. Output format: `{thinking}</think>\n\n<caption>X</caption>`.
20
+
21
+ ## Training data
22
+
23
+ - 271 New Yorker contests, top-rated caption per contest
24
+ (`yguooo/newyorker_caption_ranking`).
25
+ - The 60k Bradley-Terry preference pairs underlying the reward model
26
+ (separate split).
27
+ - We deliberately do NOT use the dataset's GPT-4o-generated
28
+ Scene/Twist/Location/Entities descriptions in the prompt, since they
29
+ hand-feed scene content to a vision-language model that can already
30
+ see the image; this makes the policy and reward model usable on any
31
+ single-panel cartoon, not just the curated subset.
32
+
33
+ ## How it fits the project
34
+
35
+ Part of a 2x2 ablation over training method (SFT, GRPO) and output
36
+ format (no thinking, thinking) for humor caption generation. See
37
+ `HumorR1/rm-qwen25vl-3b-nodesc` for the reward model used to train (and
38
+ score) this policy.
39
+
40
+ ## Inference
41
+
42
+ Backbone: `Qwen/Qwen3-VL-2B-Thinking`.
43
+ This repo is a LoRA adapter; load it with `peft.PeftModel.from_pretrained`, or serve it through vLLM's LoRA support as set up in the snippet below.
44
+
45
+ ```python
46
+ from PIL import Image
47
+ from transformers import AutoProcessor
48
+ from vllm import LLM, SamplingParams
49
+ from vllm.lora.request import LoRARequest
50
+
51
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-2B-Thinking", trust_remote_code=True)
52
+ llm = LLM(model="Qwen/Qwen3-VL-2B-Thinking", trust_remote_code=True, dtype="bfloat16",
53
+ enable_lora=True, max_lora_rank=32, max_model_len=4096)
54
+
55
+ # Caption format: <caption>X</caption>; the thinking variant first emits {thinking}</think> (the chat template's generation prompt already opens <think>).
56
+ ```
57
+
58
+ ## Reward model used during training
59
+
60
+ - `HumorR1/rm-qwen25vl-3b-nodesc` (held-out pairwise accuracy 0.6635).
adapter_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen3-VL-2B-Thinking",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 32,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "down_proj",
34
+ "q_proj",
35
+ "linear_fc2",
36
+ "qkv",
37
+ "gate_proj",
38
+ "k_proj",
39
+ "linear_fc1",
40
+ "v_proj",
41
+ "attn.proj",
42
+ "o_proj",
43
+ "up_proj"
44
+ ],
45
+ "target_parameters": null,
46
+ "task_type": "CAUSAL_LM",
47
+ "trainable_token_indices": null,
48
+ "use_bdlora": null,
49
+ "use_dora": false,
50
+ "use_qalora": false,
51
+ "use_rslora": false
52
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16b6f0b7e79b8390a3710b8e2ea4c21378f3343bed13af9563feaca3c6af7e0b
3
+ size 197219552
chat_template.jinja ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- set image_count = namespace(value=0) %}
2
+ {%- set video_count = namespace(value=0) %}
3
+ {%- macro render_content(content, do_vision_count) %}
4
+ {%- if content is string %}
5
+ {{- content }}
6
+ {%- else %}
7
+ {%- for item in content %}
8
+ {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
9
+ {%- if do_vision_count %}
10
+ {%- set image_count.value = image_count.value + 1 %}
11
+ {%- endif %}
12
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
13
+ <|vision_start|><|image_pad|><|vision_end|>
14
+ {%- elif 'video' in item or item.type == 'video' %}
15
+ {%- if do_vision_count %}
16
+ {%- set video_count.value = video_count.value + 1 %}
17
+ {%- endif %}
18
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
19
+ <|vision_start|><|video_pad|><|vision_end|>
20
+ {%- elif 'text' in item %}
21
+ {{- item.text }}
22
+ {%- endif %}
23
+ {%- endfor %}
24
+ {%- endif %}
25
+ {%- endmacro %}
26
+ {%- if tools %}
27
+ {{- '<|im_start|>system\n' }}
28
+ {%- if messages[0].role == 'system' %}
29
+ {{- render_content(messages[0].content, false) + '\n\n' }}
30
+ {%- endif %}
31
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
32
+ {%- for tool in tools %}
33
+ {{- "\n" }}
34
+ {{- tool | tojson }}
35
+ {%- endfor %}
36
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
37
+ {%- else %}
38
+ {%- if messages[0].role == 'system' %}
39
+ {{- '<|im_start|>system\n' + render_content(messages[0].content, false) + '<|im_end|>\n' }}
40
+ {%- endif %}
41
+ {%- endif %}
42
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
43
+ {%- for message in messages[::-1] %}
44
+ {%- set index = (messages|length - 1) - loop.index0 %}
45
+ {%- if ns.multi_step_tool and message.role == "user" %}
46
+ {%- set content = render_content(message.content, false) %}
47
+ {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
48
+ {%- set ns.multi_step_tool = false %}
49
+ {%- set ns.last_query_index = index %}
50
+ {%- endif %}
51
+ {%- endif %}
52
+ {%- endfor %}
53
+ {%- for message in messages %}
54
+ {%- set content = render_content(message.content, True) %}
55
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
56
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
57
+ {%- elif message.role == "assistant" %}
58
+ {%- set reasoning_content = '' %}
59
+ {%- if message.reasoning_content is string %}
60
+ {%- set reasoning_content = message.reasoning_content %}
61
+ {%- else %}
62
+ {%- if '</think>' in content %}
63
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
64
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
65
+ {%- endif %}
66
+ {%- endif %}
67
+ {%- if loop.index0 > ns.last_query_index %}
68
+ {%- if loop.last or (not loop.last and reasoning_content) %}
69
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
70
+ {%- else %}
71
+ {{- '<|im_start|>' + message.role + '\n' + content }}
72
+ {%- endif %}
73
+ {%- else %}
74
+ {{- '<|im_start|>' + message.role + '\n' + content }}
75
+ {%- endif %}
76
+ {%- if message.tool_calls %}
77
+ {%- for tool_call in message.tool_calls %}
78
+ {%- if (loop.first and content) or (not loop.first) %}
79
+ {{- '\n' }}
80
+ {%- endif %}
81
+ {%- if tool_call.function %}
82
+ {%- set tool_call = tool_call.function %}
83
+ {%- endif %}
84
+ {{- '<tool_call>\n{"name": "' }}
85
+ {{- tool_call.name }}
86
+ {{- '", "arguments": ' }}
87
+ {%- if tool_call.arguments is string %}
88
+ {{- tool_call.arguments }}
89
+ {%- else %}
90
+ {{- tool_call.arguments | tojson }}
91
+ {%- endif %}
92
+ {{- '}\n</tool_call>' }}
93
+ {%- endfor %}
94
+ {%- endif %}
95
+ {{- '<|im_end|>\n' }}
96
+ {%- elif message.role == "tool" %}
97
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
98
+ {{- '<|im_start|>user' }}
99
+ {%- endif %}
100
+ {{- '\n<tool_response>\n' }}
101
+ {{- content }}
102
+ {{- '\n</tool_response>' }}
103
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
104
+ {{- '<|im_end|>\n' }}
105
+ {%- endif %}
106
+ {%- endif %}
107
+ {%- endfor %}
108
+ {%- if add_generation_prompt %}
109
+ {{- '<|im_start|>assistant\n<think>\n' }}
110
+ {%- endif %}
processor_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "image_processor": {
3
+ "do_convert_rgb": true,
4
+ "do_normalize": true,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Qwen2VLImageProcessor",
13
+ "image_std": [
14
+ 0.5,
15
+ 0.5,
16
+ 0.5
17
+ ],
18
+ "merge_size": 2,
19
+ "patch_size": 16,
20
+ "resample": 3,
21
+ "rescale_factor": 0.00392156862745098,
22
+ "size": {
23
+ "longest_edge": 16777216,
24
+ "shortest_edge": 65536
25
+ },
26
+ "temporal_patch_size": 2
27
+ },
28
+ "processor_class": "Qwen3VLProcessor",
29
+ "video_processor": {
30
+ "do_convert_rgb": true,
31
+ "do_normalize": true,
32
+ "do_rescale": true,
33
+ "do_resize": true,
34
+ "do_sample_frames": true,
35
+ "fps": 2,
36
+ "image_mean": [
37
+ 0.5,
38
+ 0.5,
39
+ 0.5
40
+ ],
41
+ "image_std": [
42
+ 0.5,
43
+ 0.5,
44
+ 0.5
45
+ ],
46
+ "max_frames": 768,
47
+ "merge_size": 2,
48
+ "min_frames": 4,
49
+ "patch_size": 16,
50
+ "resample": 3,
51
+ "rescale_factor": 0.00392156862745098,
52
+ "return_metadata": false,
53
+ "size": {
54
+ "longest_edge": 25165824,
55
+ "shortest_edge": 4096
56
+ },
57
+ "temporal_patch_size": 2,
58
+ "video_processor_type": "Qwen3VLVideoProcessor"
59
+ }
60
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79cb3c783570f1b8fe73b9ed530ae50cae9ce4b6344c0b5edefc50478847eaa4
3
+ size 11422817
tokenizer_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 262144,
25
+ "pad_token": "<|endoftext|>",
26
+ "padding_side": "left",
27
+ "processor_class": "Qwen3VLProcessor",
28
+ "split_special_tokens": false,
29
+ "tokenizer_class": "Qwen2Tokenizer",
30
+ "truncation_side": "left",
31
+ "unk_token": null
32
+ }
trainer_state.json ADDED
@@ -0,0 +1,1434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.18450184501845018,
6
+ "eval_steps": 500,
7
+ "global_step": 50,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "cispo_clip_ratio": 0.364774439483881,
14
+ "completions/clipped_ratio": 1.0,
15
+ "completions/max_length": 5409.0,
16
+ "completions/max_terminated_length": 0.0,
17
+ "completions/mean_length": 2607.8125,
18
+ "completions/mean_terminated_length": 0.0,
19
+ "completions/min_length": 674.0,
20
+ "completions/min_terminated_length": 0.0,
21
+ "entropy": 1.4863766431808472,
22
+ "epoch": 0.0036900369003690036,
23
+ "frac_reward_zero_std": 0.0,
24
+ "grad_norm": 1.6501518985023722e-05,
25
+ "learning_rate": 0.0002,
26
+ "loss": -5.250487447483465e-05,
27
+ "num_tokens": 45661.0,
28
+ "reward": 0.2993389666080475,
29
+ "reward_std": 0.17980147898197174,
30
+ "rewards/humor_reward/mean": 0.2993389666080475,
31
+ "rewards/humor_reward/std": 0.17980147898197174,
32
+ "sampling/importance_sampling_ratio/max": 0.0032514820341020823,
33
+ "sampling/importance_sampling_ratio/mean": 0.00025803607422858477,
34
+ "sampling/importance_sampling_ratio/min": 6.591494638996664e-08,
35
+ "sampling/sampling_logp_difference/max": 3.1304593086242676,
36
+ "sampling/sampling_logp_difference/mean": 0.04722540080547333,
37
+ "step": 1,
38
+ "step_time": 79.16197230700345
39
+ },
40
+ {
41
+ "cispo_clip_ratio": 0.4739677682518959,
42
+ "completions/clipped_ratio": 0.9375,
43
+ "completions/max_length": 4785.0,
44
+ "completions/max_terminated_length": 439.0,
45
+ "completions/mean_length": 2616.0625,
46
+ "completions/mean_terminated_length": 439.0,
47
+ "completions/min_length": 439.0,
48
+ "completions/min_terminated_length": 439.0,
49
+ "entropy": 1.2316511571407318,
50
+ "epoch": 0.007380073800738007,
51
+ "frac_reward_zero_std": 0.0,
52
+ "grad_norm": 0.003729997668415308,
53
+ "learning_rate": 0.0002,
54
+ "loss": -0.003799113677814603,
55
+ "num_tokens": 91230.0,
56
+ "reward": 0.5310899019241333,
57
+ "reward_std": 0.27845898270606995,
58
+ "rewards/humor_reward/mean": 0.5310899019241333,
59
+ "rewards/humor_reward/std": 0.27845898270606995,
60
+ "sampling/importance_sampling_ratio/max": 1.048789381980896,
61
+ "sampling/importance_sampling_ratio/mean": 0.12509621679782867,
62
+ "sampling/importance_sampling_ratio/min": 0.0,
63
+ "sampling/sampling_logp_difference/max": 2.2385149002075195,
64
+ "sampling/sampling_logp_difference/mean": 0.04188045486807823,
65
+ "step": 2,
66
+ "step_time": 66.1825476239901
67
+ },
68
+ {
69
+ "cispo_clip_ratio": 0.5831195935606956,
70
+ "completions/clipped_ratio": 0.9375,
71
+ "completions/max_length": 6144.0,
72
+ "completions/max_terminated_length": 2244.0,
73
+ "completions/mean_length": 3197.3125,
74
+ "completions/mean_terminated_length": 2244.0,
75
+ "completions/min_length": 1313.0,
76
+ "completions/min_terminated_length": 2244.0,
77
+ "entropy": 1.3326809257268906,
78
+ "epoch": 0.01107011070110701,
79
+ "frac_reward_zero_std": 0.0,
80
+ "grad_norm": 5.462636181619018e-05,
81
+ "learning_rate": 0.0002,
82
+ "loss": 0.00016624285490252078,
83
+ "num_tokens": 146995.0,
84
+ "reward": 0.4853614568710327,
85
+ "reward_std": 0.2525769770145416,
86
+ "rewards/humor_reward/mean": 0.4853614568710327,
87
+ "rewards/humor_reward/std": 0.252577006816864,
88
+ "sampling/importance_sampling_ratio/max": 0.022306665778160095,
89
+ "sampling/importance_sampling_ratio/mean": 0.001404650043696165,
90
+ "sampling/importance_sampling_ratio/min": 1.7064398505350908e-10,
91
+ "sampling/sampling_logp_difference/max": 3.054729700088501,
92
+ "sampling/sampling_logp_difference/mean": 0.0515315905213356,
93
+ "step": 3,
94
+ "step_time": 86.2583410259831
95
+ },
96
+ {
97
+ "cispo_clip_ratio": 0.5900484155863523,
98
+ "completions/clipped_ratio": 1.0,
99
+ "completions/max_length": 6144.0,
100
+ "completions/max_terminated_length": 0.0,
101
+ "completions/mean_length": 3705.3125,
102
+ "completions/mean_terminated_length": 0.0,
103
+ "completions/min_length": 1009.0,
104
+ "completions/min_terminated_length": 0.0,
105
+ "entropy": 1.533411294221878,
106
+ "epoch": 0.014760147601476014,
107
+ "frac_reward_zero_std": 0.0,
108
+ "grad_norm": 0.0004810348036698997,
109
+ "learning_rate": 0.0002,
110
+ "loss": -0.0014380415668711066,
111
+ "num_tokens": 210440.0,
112
+ "reward": 0.48688405752182007,
113
+ "reward_std": 0.2839711010456085,
114
+ "rewards/humor_reward/mean": 0.48688405752182007,
115
+ "rewards/humor_reward/std": 0.2839711010456085,
116
+ "sampling/importance_sampling_ratio/max": 0.21202650666236877,
117
+ "sampling/importance_sampling_ratio/mean": 0.013267980888485909,
118
+ "sampling/importance_sampling_ratio/min": 1.5972070888103929e-10,
119
+ "sampling/sampling_logp_difference/max": 3.0988059043884277,
120
+ "sampling/sampling_logp_difference/mean": 0.04585312306880951,
121
+ "step": 4,
122
+ "step_time": 83.59175038301328
123
+ },
124
+ {
125
+ "cispo_clip_ratio": 0.6057924032211304,
126
+ "completions/clipped_ratio": 1.0,
127
+ "completions/max_length": 6144.0,
128
+ "completions/max_terminated_length": 0.0,
129
+ "completions/mean_length": 4057.8125,
130
+ "completions/mean_terminated_length": 0.0,
131
+ "completions/min_length": 482.0,
132
+ "completions/min_terminated_length": 0.0,
133
+ "entropy": 1.3052150011062622,
134
+ "epoch": 0.01845018450184502,
135
+ "frac_reward_zero_std": 0.0,
136
+ "grad_norm": 4.325129793869564e-06,
137
+ "learning_rate": 0.0002,
138
+ "loss": -5.259538738755509e-05,
139
+ "num_tokens": 279525.0,
140
+ "reward": 0.409991055727005,
141
+ "reward_std": 0.27515894174575806,
142
+ "rewards/humor_reward/mean": 0.409991055727005,
143
+ "rewards/humor_reward/std": 0.27515894174575806,
144
+ "sampling/importance_sampling_ratio/max": 0.0020468428265303373,
145
+ "sampling/importance_sampling_ratio/mean": 0.0002199342561652884,
146
+ "sampling/importance_sampling_ratio/min": 4.834557643107473e-09,
147
+ "sampling/sampling_logp_difference/max": 1.898941993713379,
148
+ "sampling/sampling_logp_difference/mean": 0.04034310206770897,
149
+ "step": 5,
150
+ "step_time": 63.17914728100004
151
+ },
152
+ {
153
+ "cispo_clip_ratio": 0.51509914919734,
154
+ "completions/clipped_ratio": 1.0,
155
+ "completions/max_length": 6144.0,
156
+ "completions/max_terminated_length": 0.0,
157
+ "completions/mean_length": 4054.625,
158
+ "completions/mean_terminated_length": 0.0,
159
+ "completions/min_length": 106.0,
160
+ "completions/min_terminated_length": 0.0,
161
+ "entropy": 1.3227798640727997,
162
+ "epoch": 0.02214022140221402,
163
+ "frac_reward_zero_std": 0.0,
164
+ "grad_norm": 0.0003930912062060088,
165
+ "learning_rate": 0.0002,
166
+ "loss": -8.775031892582774e-05,
167
+ "num_tokens": 348111.0,
168
+ "reward": 0.47100865840911865,
169
+ "reward_std": 0.2912721335887909,
170
+ "rewards/humor_reward/mean": 0.47100865840911865,
171
+ "rewards/humor_reward/std": 0.2912721633911133,
172
+ "sampling/importance_sampling_ratio/max": 0.26060751080513,
173
+ "sampling/importance_sampling_ratio/mean": 0.0164032019674778,
174
+ "sampling/importance_sampling_ratio/min": 9.639339326739105e-10,
175
+ "sampling/sampling_logp_difference/max": 2.0555481910705566,
176
+ "sampling/sampling_logp_difference/mean": 0.039414241909980774,
177
+ "step": 6,
178
+ "step_time": 82.86594534001779
179
+ },
180
+ {
181
+ "cispo_clip_ratio": 0.6096626706421375,
182
+ "completions/clipped_ratio": 0.875,
183
+ "completions/max_length": 6144.0,
184
+ "completions/max_terminated_length": 3121.0,
185
+ "completions/mean_length": 3594.3125,
186
+ "completions/mean_terminated_length": 1685.5,
187
+ "completions/min_length": 184.0,
188
+ "completions/min_terminated_length": 250.0,
189
+ "entropy": 1.4805490970611572,
190
+ "epoch": 0.025830258302583026,
191
+ "frac_reward_zero_std": 0.0,
192
+ "grad_norm": 0.0001002174976747483,
193
+ "learning_rate": 0.0002,
194
+ "loss": -0.00036368955625221133,
195
+ "num_tokens": 409556.0,
196
+ "reward": 0.25648510456085205,
197
+ "reward_std": 0.21258896589279175,
198
+ "rewards/humor_reward/mean": 0.25648510456085205,
199
+ "rewards/humor_reward/std": 0.21258896589279175,
200
+ "sampling/importance_sampling_ratio/max": 0.042766377329826355,
201
+ "sampling/importance_sampling_ratio/mean": 0.003916537389159203,
202
+ "sampling/importance_sampling_ratio/min": 3.151466310136186e-10,
203
+ "sampling/sampling_logp_difference/max": 2.511254072189331,
204
+ "sampling/sampling_logp_difference/mean": 0.04306597262620926,
205
+ "step": 7,
206
+ "step_time": 77.88916695199441
207
+ },
208
+ {
209
+ "cispo_clip_ratio": 0.4443662669509649,
210
+ "completions/clipped_ratio": 1.0,
211
+ "completions/max_length": 6144.0,
212
+ "completions/max_terminated_length": 0.0,
213
+ "completions/mean_length": 4063.8125,
214
+ "completions/mean_terminated_length": 0.0,
215
+ "completions/min_length": 426.0,
216
+ "completions/min_terminated_length": 0.0,
217
+ "entropy": 1.275453269481659,
218
+ "epoch": 0.02952029520295203,
219
+ "frac_reward_zero_std": 0.0,
220
+ "grad_norm": 1.6524854800081812e-05,
221
+ "learning_rate": 0.0002,
222
+ "loss": 6.745757855242118e-05,
223
+ "num_tokens": 478513.0,
224
+ "reward": 0.3238402009010315,
225
+ "reward_std": 0.2930999994277954,
226
+ "rewards/humor_reward/mean": 0.3238402009010315,
227
+ "rewards/humor_reward/std": 0.2931000292301178,
228
+ "sampling/importance_sampling_ratio/max": 0.010375253856182098,
229
+ "sampling/importance_sampling_ratio/mean": 0.0010308868950232863,
230
+ "sampling/importance_sampling_ratio/min": 5.8848907968922504e-08,
231
+ "sampling/sampling_logp_difference/max": 3.019871711730957,
232
+ "sampling/sampling_logp_difference/mean": 0.038398630917072296,
233
+ "step": 8,
234
+ "step_time": 62.586684064008296
235
+ },
236
+ {
237
+ "cispo_clip_ratio": 0.4614889621734619,
238
+ "completions/clipped_ratio": 1.0,
239
+ "completions/max_length": 6144.0,
240
+ "completions/max_terminated_length": 0.0,
241
+ "completions/mean_length": 3351.5,
242
+ "completions/mean_terminated_length": 0.0,
243
+ "completions/min_length": 925.0,
244
+ "completions/min_terminated_length": 0.0,
245
+ "entropy": 1.392012134194374,
246
+ "epoch": 0.033210332103321034,
247
+ "frac_reward_zero_std": 0.0,
248
+ "grad_norm": 5.4697693485650234e-08,
249
+ "learning_rate": 0.0002,
250
+ "loss": 3.4436376949997793e-07,
251
+ "num_tokens": 536969.0,
252
+ "reward": 0.320010781288147,
253
+ "reward_std": 0.26967623829841614,
254
+ "rewards/humor_reward/mean": 0.320010781288147,
255
+ "rewards/humor_reward/std": 0.26967623829841614,
256
+ "sampling/importance_sampling_ratio/max": 2.8325872335699387e-05,
257
+ "sampling/importance_sampling_ratio/mean": 4.4159696699352935e-06,
258
+ "sampling/importance_sampling_ratio/min": 4.985558949184565e-13,
259
+ "sampling/sampling_logp_difference/max": 2.7537732124328613,
260
+ "sampling/sampling_logp_difference/mean": 0.05181171000003815,
261
+ "step": 9,
262
+ "step_time": 85.44417907499883
263
+ },
264
+ {
265
+ "cispo_clip_ratio": 0.46727752313017845,
266
+ "completions/clipped_ratio": 1.0,
267
+ "completions/max_length": 6144.0,
268
+ "completions/max_terminated_length": 0.0,
269
+ "completions/mean_length": 4545.25,
270
+ "completions/mean_terminated_length": 0.0,
271
+ "completions/min_length": 1297.0,
272
+ "completions/min_terminated_length": 0.0,
273
+ "entropy": 1.277551643550396,
274
+ "epoch": 0.03690036900369004,
275
+ "frac_reward_zero_std": 0.0,
276
+ "grad_norm": 0.0018553344998508692,
277
+ "learning_rate": 0.0002,
278
+ "loss": -0.00018077026470564306,
279
+ "num_tokens": 613629.0,
280
+ "reward": 0.3976645767688751,
281
+ "reward_std": 0.3508976995944977,
282
+ "rewards/humor_reward/mean": 0.3976645767688751,
283
+ "rewards/humor_reward/std": 0.3508976995944977,
284
+ "sampling/importance_sampling_ratio/max": 0.32763394713401794,
285
+ "sampling/importance_sampling_ratio/mean": 0.02203518897294998,
286
+ "sampling/importance_sampling_ratio/min": 2.647046748460724e-10,
287
+ "sampling/sampling_logp_difference/max": 3.727196216583252,
288
+ "sampling/sampling_logp_difference/mean": 0.037366170436143875,
289
+ "step": 10,
290
+ "step_time": 83.16811528199469
291
+ },
292
+ {
293
+ "cispo_clip_ratio": 0.564013222232461,
294
+ "completions/clipped_ratio": 1.0,
295
+ "completions/max_length": 6144.0,
296
+ "completions/max_terminated_length": 0.0,
297
+ "completions/mean_length": 4322.8125,
298
+ "completions/mean_terminated_length": 0.0,
299
+ "completions/min_length": 705.0,
300
+ "completions/min_terminated_length": 0.0,
301
+ "entropy": 1.0119460746645927,
302
+ "epoch": 0.04059040590405904,
303
+ "frac_reward_zero_std": 0.0,
304
+ "grad_norm": 0.0012440788559615612,
305
+ "learning_rate": 0.0002,
306
+ "loss": -0.0035373736172914505,
307
+ "num_tokens": 687402.0,
308
+ "reward": 0.378947377204895,
309
+ "reward_std": 0.2340850830078125,
310
+ "rewards/humor_reward/mean": 0.378947377204895,
311
+ "rewards/humor_reward/std": 0.2340850979089737,
312
+ "sampling/importance_sampling_ratio/max": 0.30296310782432556,
313
+ "sampling/importance_sampling_ratio/mean": 0.018969321623444557,
314
+ "sampling/importance_sampling_ratio/min": 8.10443156811988e-15,
315
+ "sampling/sampling_logp_difference/max": 3.91379976272583,
316
+ "sampling/sampling_logp_difference/mean": 0.04132331907749176,
317
+ "step": 11,
318
+ "step_time": 94.6180467310187
319
+ },
320
+ {
321
+ "cispo_clip_ratio": 0.3971639759838581,
322
+ "completions/clipped_ratio": 1.0,
323
+ "completions/max_length": 6144.0,
324
+ "completions/max_terminated_length": 0.0,
325
+ "completions/mean_length": 5086.0,
326
+ "completions/mean_terminated_length": 0.0,
327
+ "completions/min_length": 1271.0,
328
+ "completions/min_terminated_length": 0.0,
329
+ "entropy": 1.0960774347186089,
330
+ "epoch": 0.04428044280442804,
331
+ "frac_reward_zero_std": 0.0,
332
+ "grad_norm": 5.153976962901652e-05,
333
+ "learning_rate": 0.0002,
334
+ "loss": -0.00044693646486848593,
335
+ "num_tokens": 772714.0,
336
+ "reward": 0.32133913040161133,
337
+ "reward_std": 0.380583792924881,
338
+ "rewards/humor_reward/mean": 0.32133913040161133,
339
+ "rewards/humor_reward/std": 0.38058382272720337,
340
+ "sampling/importance_sampling_ratio/max": 0.017419712617993355,
341
+ "sampling/importance_sampling_ratio/mean": 0.001832809066399932,
342
+ "sampling/importance_sampling_ratio/min": 6.273771866599498e-12,
343
+ "sampling/sampling_logp_difference/max": 5.7523579597473145,
344
+ "sampling/sampling_logp_difference/mean": 0.034892238676548004,
345
+ "step": 12,
346
+ "step_time": 79.48709479898389
347
+ },
348
+ {
349
+ "cispo_clip_ratio": 0.5508307814598083,
350
+ "completions/clipped_ratio": 1.0,
351
+ "completions/max_length": 6144.0,
352
+ "completions/max_terminated_length": 0.0,
353
+ "completions/mean_length": 4785.875,
354
+ "completions/mean_terminated_length": 0.0,
355
+ "completions/min_length": 1482.0,
356
+ "completions/min_terminated_length": 0.0,
357
+ "entropy": 1.299708716571331,
358
+ "epoch": 0.04797047970479705,
359
+ "frac_reward_zero_std": 0.0,
360
+ "grad_norm": 6.673526968370425e-06,
361
+ "learning_rate": 0.0002,
362
+ "loss": -2.0674373445217498e-05,
363
+ "num_tokens": 853224.0,
364
+ "reward": 0.36821186542510986,
365
+ "reward_std": 0.31694290041923523,
366
+ "rewards/humor_reward/mean": 0.36821186542510986,
367
+ "rewards/humor_reward/std": 0.3169429302215576,
368
+ "sampling/importance_sampling_ratio/max": 0.0009282280807383358,
369
+ "sampling/importance_sampling_ratio/mean": 0.0001542143290862441,
370
+ "sampling/importance_sampling_ratio/min": 4.797664487909969e-10,
371
+ "sampling/sampling_logp_difference/max": 2.7313647270202637,
372
+ "sampling/sampling_logp_difference/mean": 0.03830663487315178,
373
+ "step": 13,
374
+ "step_time": 82.41276069695596
375
+ },
376
+ {
377
+ "cispo_clip_ratio": 0.3803188279271126,
378
+ "completions/clipped_ratio": 0.9375,
379
+ "completions/max_length": 6144.0,
380
+ "completions/max_terminated_length": 1625.0,
381
+ "completions/mean_length": 4955.6875,
382
+ "completions/mean_terminated_length": 1625.0,
383
+ "completions/min_length": 968.0,
384
+ "completions/min_terminated_length": 1625.0,
385
+ "entropy": 1.364896483719349,
386
+ "epoch": 0.05166051660516605,
387
+ "frac_reward_zero_std": 0.0,
388
+ "grad_norm": 1.3385028978518676e-05,
389
+ "learning_rate": 0.0002,
390
+ "loss": 2.8572056180564687e-05,
391
+ "num_tokens": 937123.0,
392
+ "reward": 0.2386319488286972,
393
+ "reward_std": 0.28499212861061096,
394
+ "rewards/humor_reward/mean": 0.2386319488286972,
395
+ "rewards/humor_reward/std": 0.28499215841293335,
396
+ "sampling/importance_sampling_ratio/max": 0.009222447872161865,
397
+ "sampling/importance_sampling_ratio/mean": 0.0005764853558503091,
398
+ "sampling/importance_sampling_ratio/min": 1.6275064690298314e-15,
399
+ "sampling/sampling_logp_difference/max": 4.84999942779541,
400
+ "sampling/sampling_logp_difference/mean": 0.04662536829710007,
401
+ "step": 14,
402
+ "step_time": 85.49535570498847
403
+ },
404
+ {
405
+ "cispo_clip_ratio": 0.3275146186351776,
406
+ "completions/clipped_ratio": 0.875,
407
+ "completions/max_length": 6144.0,
408
+ "completions/max_terminated_length": 2590.0,
409
+ "completions/mean_length": 4530.75,
410
+ "completions/mean_terminated_length": 2166.5,
411
+ "completions/min_length": 505.0,
412
+ "completions/min_terminated_length": 1743.0,
413
+ "entropy": 0.9317370727658272,
414
+ "epoch": 0.055350553505535055,
415
+ "frac_reward_zero_std": 0.0,
416
+ "grad_norm": 4.116483614780009e-05,
417
+ "learning_rate": 0.0002,
418
+ "loss": 0.0002636128047015518,
419
+ "num_tokens": 1013327.0,
420
+ "reward": 0.2526451349258423,
421
+ "reward_std": 0.36080095171928406,
422
+ "rewards/humor_reward/mean": 0.2526451349258423,
423
+ "rewards/humor_reward/std": 0.36080095171928406,
424
+ "sampling/importance_sampling_ratio/max": 0.011370004154741764,
425
+ "sampling/importance_sampling_ratio/mean": 0.001944657531566918,
426
+ "sampling/importance_sampling_ratio/min": 1.1834137185820492e-12,
427
+ "sampling/sampling_logp_difference/max": 3.4003725051879883,
428
+ "sampling/sampling_logp_difference/mean": 0.03138995170593262,
429
+ "step": 15,
430
+ "step_time": 73.45879589200194
431
+ },
432
+ {
433
+ "cispo_clip_ratio": 0.1871738387271762,
434
+ "completions/clipped_ratio": 1.0,
435
+ "completions/max_length": 6144.0,
436
+ "completions/max_terminated_length": 0.0,
437
+ "completions/mean_length": 5333.125,
438
+ "completions/mean_terminated_length": 0.0,
439
+ "completions/min_length": 854.0,
440
+ "completions/min_terminated_length": 0.0,
441
+ "entropy": 1.1076993495225906,
442
+ "epoch": 0.05904059040590406,
443
+ "frac_reward_zero_std": 0.0,
444
+ "grad_norm": 0.0002949201443698257,
445
+ "learning_rate": 0.0002,
446
+ "loss": 0.0002496525994502008,
447
+ "num_tokens": 1102593.0,
448
+ "reward": 0.12923595309257507,
449
+ "reward_std": 0.24049529433250427,
450
+ "rewards/humor_reward/mean": 0.12923595309257507,
451
+ "rewards/humor_reward/std": 0.24049529433250427,
452
+ "sampling/importance_sampling_ratio/max": 0.13862203061580658,
453
+ "sampling/importance_sampling_ratio/mean": 0.013717259280383587,
454
+ "sampling/importance_sampling_ratio/min": 2.0222361563071445e-09,
455
+ "sampling/sampling_logp_difference/max": 1.9613001346588135,
456
+ "sampling/sampling_logp_difference/mean": 0.02957136556506157,
457
+ "step": 16,
458
+ "step_time": 76.10329114498745
459
+ },
460
+ {
461
+ "cispo_clip_ratio": 0.2707713171839714,
462
+ "completions/clipped_ratio": 1.0,
463
+ "completions/max_length": 6144.0,
464
+ "completions/max_terminated_length": 0.0,
465
+ "completions/mean_length": 5502.0,
466
+ "completions/mean_terminated_length": 0.0,
467
+ "completions/min_length": 2296.0,
468
+ "completions/min_terminated_length": 0.0,
469
+ "entropy": 1.3415134847164154,
470
+ "epoch": 0.06273062730627306,
471
+ "frac_reward_zero_std": 0.0,
472
+ "grad_norm": 1.443507812837197e-06,
473
+ "learning_rate": 0.0002,
474
+ "loss": 1.1968148101004772e-05,
475
+ "num_tokens": 1194337.0,
476
+ "reward": 0.14995136857032776,
477
+ "reward_std": 0.2515668570995331,
478
+ "rewards/humor_reward/mean": 0.14995136857032776,
479
+ "rewards/humor_reward/std": 0.25156688690185547,
480
+ "sampling/importance_sampling_ratio/max": 0.00044680986320599914,
481
+ "sampling/importance_sampling_ratio/mean": 3.927269790438004e-05,
482
+ "sampling/importance_sampling_ratio/min": 9.991393268293791e-10,
483
+ "sampling/sampling_logp_difference/max": 3.0433225631713867,
484
+ "sampling/sampling_logp_difference/mean": 0.033172741532325745,
485
+ "step": 17,
486
+ "step_time": 76.20640446299512
487
+ },
488
+ {
489
+ "cispo_clip_ratio": 0.4475329667329788,
490
+ "completions/clipped_ratio": 1.0,
491
+ "completions/max_length": 6144.0,
492
+ "completions/max_terminated_length": 0.0,
493
+ "completions/mean_length": 4663.4375,
494
+ "completions/mean_terminated_length": 0.0,
495
+ "completions/min_length": 1583.0,
496
+ "completions/min_terminated_length": 0.0,
497
+ "entropy": 1.0032905638217926,
498
+ "epoch": 0.06642066420664207,
499
+ "frac_reward_zero_std": 0.0,
500
+ "grad_norm": 4.456151003751074e-08,
501
+ "learning_rate": 0.0002,
502
+ "loss": 1.6434560734523984e-07,
503
+ "num_tokens": 1273560.0,
504
+ "reward": 0.47607260942459106,
505
+ "reward_std": 0.3206636905670166,
506
+ "rewards/humor_reward/mean": 0.47607260942459106,
507
+ "rewards/humor_reward/std": 0.3206636905670166,
508
+ "sampling/importance_sampling_ratio/max": 1.8386488591204397e-05,
509
+ "sampling/importance_sampling_ratio/mean": 2.1001696950406767e-06,
510
+ "sampling/importance_sampling_ratio/min": 2.987033566998441e-14,
511
+ "sampling/sampling_logp_difference/max": 2.9727420806884766,
512
+ "sampling/sampling_logp_difference/mean": 0.04333854466676712,
513
+ "step": 18,
514
+ "step_time": 91.44482385799347
515
+ },
516
+ {
517
+ "cispo_clip_ratio": 0.5934446323662996,
518
+ "completions/clipped_ratio": 1.0,
519
+ "completions/max_length": 6144.0,
520
+ "completions/max_terminated_length": 0.0,
521
+ "completions/mean_length": 4332.0,
522
+ "completions/mean_terminated_length": 0.0,
523
+ "completions/min_length": 692.0,
524
+ "completions/min_terminated_length": 0.0,
525
+ "entropy": 1.2455691695213318,
526
+ "epoch": 0.07011070110701106,
527
+ "frac_reward_zero_std": 0.0,
528
+ "grad_norm": 2.6948493541567586e-05,
529
+ "learning_rate": 0.0002,
530
+ "loss": 0.00015589207760058343,
531
+ "num_tokens": 1346808.0,
532
+ "reward": 0.4407455325126648,
533
+ "reward_std": 0.32844167947769165,
534
+ "rewards/humor_reward/mean": 0.4407455325126648,
535
+ "rewards/humor_reward/std": 0.32844167947769165,
536
+ "sampling/importance_sampling_ratio/max": 0.0622418075799942,
537
+ "sampling/importance_sampling_ratio/mean": 0.00455310195684433,
538
+ "sampling/importance_sampling_ratio/min": 6.87965170942384e-11,
539
+ "sampling/sampling_logp_difference/max": 1.9999836683273315,
540
+ "sampling/sampling_logp_difference/mean": 0.040162429213523865,
541
+ "step": 19,
542
+ "step_time": 63.66296142600186
543
+ },
544
+ {
545
+ "cispo_clip_ratio": 0.3904235363006592,
546
+ "completions/clipped_ratio": 0.9375,
547
+ "completions/max_length": 6144.0,
548
+ "completions/max_terminated_length": 3539.0,
549
+ "completions/mean_length": 4865.125,
550
+ "completions/mean_terminated_length": 3539.0,
551
+ "completions/min_length": 1705.0,
552
+ "completions/min_terminated_length": 3539.0,
553
+ "entropy": 1.1548645570874214,
554
+ "epoch": 0.07380073800738007,
555
+ "frac_reward_zero_std": 0.0,
556
+ "grad_norm": 0.0003063328331336379,
557
+ "learning_rate": 0.0002,
558
+ "loss": 0.0016583555843681097,
559
+ "num_tokens": 1428810.0,
560
+ "reward": 0.32657718658447266,
561
+ "reward_std": 0.3702032268047333,
562
+ "rewards/humor_reward/mean": 0.32657718658447266,
563
+ "rewards/humor_reward/std": 0.37020325660705566,
564
+ "sampling/importance_sampling_ratio/max": 0.07926318794488907,
565
+ "sampling/importance_sampling_ratio/mean": 0.00835469737648964,
566
+ "sampling/importance_sampling_ratio/min": 1.397608049256982e-12,
567
+ "sampling/sampling_logp_difference/max": 3.58780574798584,
568
+ "sampling/sampling_logp_difference/mean": 0.036592792719602585,
569
+ "step": 20,
570
+ "step_time": 80.32529347503441
571
+ },
572
+ {
573
+ "cispo_clip_ratio": 0.46490118466317654,
574
+ "completions/clipped_ratio": 1.0,
575
+ "completions/max_length": 6144.0,
576
+ "completions/max_terminated_length": 0.0,
577
+ "completions/mean_length": 4817.375,
578
+ "completions/mean_terminated_length": 0.0,
579
+ "completions/min_length": 1635.0,
580
+ "completions/min_terminated_length": 0.0,
581
+ "entropy": 1.468906119465828,
582
+ "epoch": 0.07749077490774908,
583
+ "frac_reward_zero_std": 0.0,
584
+ "grad_norm": 9.814787915729539e-08,
585
+ "learning_rate": 0.0002,
586
+ "loss": -6.973982635827269e-07,
587
+ "num_tokens": 1510048.0,
588
+ "reward": 0.4167077839374542,
589
+ "reward_std": 0.3755662441253662,
590
+ "rewards/humor_reward/mean": 0.4167077839374542,
591
+ "rewards/humor_reward/std": 0.3755662441253662,
592
+ "sampling/importance_sampling_ratio/max": 2.670207504706923e-05,
593
+ "sampling/importance_sampling_ratio/mean": 3.46238084603101e-06,
594
+ "sampling/importance_sampling_ratio/min": 6.556125781154165e-11,
595
+ "sampling/sampling_logp_difference/max": 2.0220751762390137,
596
+ "sampling/sampling_logp_difference/mean": 0.04112354293465614,
597
+ "step": 21,
598
+ "step_time": 66.39175808998698
599
+ },
600
+ {
601
+ "cispo_clip_ratio": 0.4479520544409752,
602
+ "completions/clipped_ratio": 1.0,
603
+ "completions/max_length": 6144.0,
604
+ "completions/max_terminated_length": 0.0,
605
+ "completions/mean_length": 4476.625,
606
+ "completions/mean_terminated_length": 0.0,
607
+ "completions/min_length": 576.0,
608
+ "completions/min_terminated_length": 0.0,
609
+ "entropy": 1.3538827523589134,
610
+ "epoch": 0.08118081180811808,
611
+ "frac_reward_zero_std": 0.0,
612
+ "grad_norm": 9.053791472979356e-06,
613
+ "learning_rate": 0.0002,
614
+ "loss": -4.3639585783239454e-05,
615
+ "num_tokens": 1585386.0,
616
+ "reward": 0.398086816072464,
617
+ "reward_std": 0.39999622106552124,
618
+ "rewards/humor_reward/mean": 0.398086816072464,
619
+ "rewards/humor_reward/std": 0.39999625086784363,
620
+ "sampling/importance_sampling_ratio/max": 0.011358072981238365,
621
+ "sampling/importance_sampling_ratio/mean": 0.0010091899894177914,
622
+ "sampling/importance_sampling_ratio/min": 4.4679931976432385e-10,
623
+ "sampling/sampling_logp_difference/max": 3.5639562606811523,
624
+ "sampling/sampling_logp_difference/mean": 0.035584691911935806,
625
+ "step": 22,
626
+ "step_time": 77.29983079498925
627
+ },
628
+ {
629
+ "cispo_clip_ratio": 0.5936666019260883,
630
+ "completions/clipped_ratio": 1.0,
631
+ "completions/max_length": 6144.0,
632
+ "completions/max_terminated_length": 0.0,
633
+ "completions/mean_length": 4713.3125,
634
+ "completions/mean_terminated_length": 0.0,
635
+ "completions/min_length": 701.0,
636
+ "completions/min_terminated_length": 0.0,
637
+ "entropy": 1.1694510877132416,
638
+ "epoch": 0.08487084870848709,
639
+ "frac_reward_zero_std": 0.0,
640
+ "grad_norm": 0.00025732198264449835,
641
+ "learning_rate": 0.0002,
642
+ "loss": 0.00032361538615077734,
643
+ "num_tokens": 1664511.0,
644
+ "reward": 0.3887137174606323,
645
+ "reward_std": 0.35025978088378906,
646
+ "rewards/humor_reward/mean": 0.3887137174606323,
647
+ "rewards/humor_reward/std": 0.35025978088378906,
648
+ "sampling/importance_sampling_ratio/max": 0.2608579397201538,
649
+ "sampling/importance_sampling_ratio/mean": 0.016369398683309555,
650
+ "sampling/importance_sampling_ratio/min": 5.653824977636113e-11,
651
+ "sampling/sampling_logp_difference/max": 1.6544189453125,
652
+ "sampling/sampling_logp_difference/mean": 0.03655308857560158,
653
+ "step": 23,
654
+ "step_time": 79.8481750869978
655
+ },
656
+ {
657
+ "cispo_clip_ratio": 0.46904783695936203,
658
+ "completions/clipped_ratio": 1.0,
659
+ "completions/max_length": 6144.0,
660
+ "completions/max_terminated_length": 0.0,
661
+ "completions/mean_length": 4739.25,
662
+ "completions/mean_terminated_length": 0.0,
663
+ "completions/min_length": 2132.0,
664
+ "completions/min_terminated_length": 0.0,
665
+ "entropy": 1.2212552726268768,
666
+ "epoch": 0.08856088560885608,
667
+ "frac_reward_zero_std": 0.0,
668
+ "grad_norm": 2.1028017727076076e-06,
669
+ "learning_rate": 0.0002,
670
+ "loss": 5.860923010914121e-06,
671
+ "num_tokens": 1744499.0,
672
+ "reward": 0.33275866508483887,
673
+ "reward_std": 0.3678642511367798,
674
+ "rewards/humor_reward/mean": 0.33275866508483887,
675
+ "rewards/humor_reward/std": 0.3678642809391022,
676
+ "sampling/importance_sampling_ratio/max": 0.0013681778218597174,
677
+ "sampling/importance_sampling_ratio/mean": 9.336201765108854e-05,
678
+ "sampling/importance_sampling_ratio/min": 2.4985560842516463e-10,
679
+ "sampling/sampling_logp_difference/max": 1.9406442642211914,
680
+ "sampling/sampling_logp_difference/mean": 0.03511609137058258,
681
+ "step": 24,
682
+ "step_time": 79.88214839099965
683
+ },
684
+ {
685
+ "cispo_clip_ratio": 0.4411753863096237,
686
+ "completions/clipped_ratio": 1.0,
687
+ "completions/max_length": 6144.0,
688
+ "completions/max_terminated_length": 0.0,
689
+ "completions/mean_length": 4651.4375,
690
+ "completions/mean_terminated_length": 0.0,
691
+ "completions/min_length": 1004.0,
692
+ "completions/min_terminated_length": 0.0,
693
+ "entropy": 0.8522009998559952,
694
+ "epoch": 0.09225092250922509,
695
+ "frac_reward_zero_std": 0.0,
696
+ "grad_norm": 0.00017764404765330255,
697
+ "learning_rate": 0.0002,
698
+ "loss": 0.0008231036481447518,
699
+ "num_tokens": 1823082.0,
700
+ "reward": 0.274623304605484,
701
+ "reward_std": 0.3160068690776825,
702
+ "rewards/humor_reward/mean": 0.274623304605484,
703
+ "rewards/humor_reward/std": 0.3160068988800049,
704
+ "sampling/importance_sampling_ratio/max": 0.03566610440611839,
705
+ "sampling/importance_sampling_ratio/mean": 0.0037315732333809137,
706
+ "sampling/importance_sampling_ratio/min": 4.288159161092153e-09,
707
+ "sampling/sampling_logp_difference/max": 2.191567897796631,
708
+ "sampling/sampling_logp_difference/mean": 0.027100346982479095,
709
+ "step": 25,
710
+ "step_time": 65.47632334999798
711
+ },
712
+ {
713
+ "cispo_clip_ratio": 0.3125321865081787,
714
+ "completions/clipped_ratio": 1.0,
715
+ "completions/max_length": 6144.0,
716
+ "completions/max_terminated_length": 0.0,
717
+ "completions/mean_length": 5114.5625,
718
+ "completions/mean_terminated_length": 0.0,
719
+ "completions/min_length": 354.0,
720
+ "completions/min_terminated_length": 0.0,
721
+ "entropy": 0.8008184731006622,
722
+ "epoch": 0.0959409594095941,
723
+ "frac_reward_zero_std": 0.0,
724
+ "grad_norm": 0.00044094581971876323,
725
+ "learning_rate": 0.0002,
726
+ "loss": -0.0023368855472654104,
727
+ "num_tokens": 1909075.0,
728
+ "reward": 0.2744894027709961,
729
+ "reward_std": 0.3853631615638733,
730
+ "rewards/humor_reward/mean": 0.2744894027709961,
731
+ "rewards/humor_reward/std": 0.3853631913661957,
732
+ "sampling/importance_sampling_ratio/max": 0.27399709820747375,
733
+ "sampling/importance_sampling_ratio/mean": 0.02907378226518631,
734
+ "sampling/importance_sampling_ratio/min": 1.0831848751280404e-09,
735
+ "sampling/sampling_logp_difference/max": 2.079195022583008,
736
+ "sampling/sampling_logp_difference/mean": 0.027823781594634056,
737
+ "step": 26,
738
+ "step_time": 79.29085839301115
739
+ },
740
+ {
741
+ "cispo_clip_ratio": 0.45367857813835144,
742
+ "completions/clipped_ratio": 1.0,
743
+ "completions/max_length": 6144.0,
744
+ "completions/max_terminated_length": 0.0,
745
+ "completions/mean_length": 4420.875,
746
+ "completions/mean_terminated_length": 0.0,
747
+ "completions/min_length": 1406.0,
748
+ "completions/min_terminated_length": 0.0,
749
+ "entropy": 1.1720404550433159,
750
+ "epoch": 0.0996309963099631,
751
+ "frac_reward_zero_std": 0.0,
752
+ "grad_norm": 2.0236677300999872e-05,
753
+ "learning_rate": 0.0002,
754
+ "loss": -0.0002367804991081357,
755
+ "num_tokens": 1983073.0,
756
+ "reward": 0.37771666049957275,
757
+ "reward_std": 0.3381035327911377,
758
+ "rewards/humor_reward/mean": 0.37771666049957275,
759
+ "rewards/humor_reward/std": 0.3381035625934601,
760
+ "sampling/importance_sampling_ratio/max": 0.01442171260714531,
761
+ "sampling/importance_sampling_ratio/mean": 0.0017098486423492432,
762
+ "sampling/importance_sampling_ratio/min": 2.375326602077621e-08,
763
+ "sampling/sampling_logp_difference/max": 2.334380626678467,
764
+ "sampling/sampling_logp_difference/mean": 0.03178582713007927,
765
+ "step": 27,
766
+ "step_time": 75.20771357801277
767
+ },
768
+ {
769
+ "cispo_clip_ratio": 0.6514521557837725,
770
+ "completions/clipped_ratio": 0.9375,
771
+ "completions/max_length": 6144.0,
772
+ "completions/max_terminated_length": 5111.0,
773
+ "completions/mean_length": 3562.625,
774
+ "completions/mean_terminated_length": 5111.0,
775
+ "completions/min_length": 171.0,
776
+ "completions/min_terminated_length": 5111.0,
777
+ "entropy": 1.2078422084450722,
778
+ "epoch": 0.1033210332103321,
779
+ "frac_reward_zero_std": 0.0,
780
+ "grad_norm": 0.000864825677126646,
781
+ "learning_rate": 0.0002,
782
+ "loss": 0.0034890188835561275,
783
+ "num_tokens": 2044011.0,
784
+ "reward": 0.5999776124954224,
785
+ "reward_std": 0.378849059343338,
786
+ "rewards/humor_reward/mean": 0.5999776124954224,
787
+ "rewards/humor_reward/std": 0.3788490891456604,
788
+ "sampling/importance_sampling_ratio/max": 0.4461648166179657,
789
+ "sampling/importance_sampling_ratio/mean": 0.056391313672065735,
790
+ "sampling/importance_sampling_ratio/min": 1.9029215536647826e-08,
791
+ "sampling/sampling_logp_difference/max": 2.0776968002319336,
792
+ "sampling/sampling_logp_difference/mean": 0.04053337126970291,
793
+ "step": 28,
794
+ "step_time": 59.718489810984465
795
+ },
796
+ {
797
+ "cispo_clip_ratio": 0.5842036623507738,
798
+ "completions/clipped_ratio": 1.0,
799
+ "completions/max_length": 6144.0,
800
+ "completions/max_terminated_length": 0.0,
801
+ "completions/mean_length": 3761.1875,
802
+ "completions/mean_terminated_length": 0.0,
803
+ "completions/min_length": 1373.0,
804
+ "completions/min_terminated_length": 0.0,
805
+ "entropy": 0.9947392120957375,
806
+ "epoch": 0.1070110701107011,
807
+ "frac_reward_zero_std": 0.0,
808
+ "grad_norm": 5.404234002526209e-07,
809
+ "learning_rate": 0.0002,
810
+ "loss": 3.1678846426075324e-06,
811
+ "num_tokens": 2108798.0,
812
+ "reward": 0.5179843902587891,
813
+ "reward_std": 0.2485807240009308,
814
+ "rewards/humor_reward/mean": 0.5179843902587891,
815
+ "rewards/humor_reward/std": 0.2485807240009308,
816
+ "sampling/importance_sampling_ratio/max": 0.00023896525090094656,
817
+ "sampling/importance_sampling_ratio/mean": 1.7564820154802874e-05,
818
+ "sampling/importance_sampling_ratio/min": 8.362109155070005e-15,
819
+ "sampling/sampling_logp_difference/max": 3.4245944023132324,
820
+ "sampling/sampling_logp_difference/mean": 0.04190240800380707,
821
+ "step": 29,
822
+ "step_time": 92.04002382898761
823
+ },
824
+ {
825
+ "cispo_clip_ratio": 0.481358059681952,
826
+ "completions/clipped_ratio": 0.9375,
827
+ "completions/max_length": 6144.0,
828
+ "completions/max_terminated_length": 3548.0,
829
+ "completions/mean_length": 3230.25,
830
+ "completions/mean_terminated_length": 3548.0,
831
+ "completions/min_length": 142.0,
832
+ "completions/min_terminated_length": 3548.0,
833
+ "entropy": 0.9884351938962936,
834
+ "epoch": 0.11070110701107011,
835
+ "frac_reward_zero_std": 0.0,
836
+ "grad_norm": 0.0007160792592912912,
837
+ "learning_rate": 0.0002,
838
+ "loss": 0.0030249895062297583,
839
+ "num_tokens": 2165090.0,
840
+ "reward": 0.32606449723243713,
841
+ "reward_std": 0.2554178833961487,
842
+ "rewards/humor_reward/mean": 0.32606449723243713,
843
+ "rewards/humor_reward/std": 0.2554178833961487,
844
+ "sampling/importance_sampling_ratio/max": 0.19502675533294678,
845
+ "sampling/importance_sampling_ratio/mean": 0.01895805075764656,
846
+ "sampling/importance_sampling_ratio/min": 1.5065569625161146e-10,
847
+ "sampling/sampling_logp_difference/max": 2.431838035583496,
848
+ "sampling/sampling_logp_difference/mean": 0.04055629298090935,
849
+ "step": 30,
850
+ "step_time": 83.23436555001535
851
+ },
852
+ {
853
+ "cispo_clip_ratio": 0.6561583578586578,
854
+ "completions/clipped_ratio": 1.0,
855
+ "completions/max_length": 6144.0,
856
+ "completions/max_terminated_length": 0.0,
857
+ "completions/mean_length": 3374.625,
858
+ "completions/mean_terminated_length": 0.0,
859
+ "completions/min_length": 247.0,
860
+ "completions/min_terminated_length": 0.0,
861
+ "entropy": 1.0753171369433403,
862
+ "epoch": 0.11439114391143912,
863
+ "frac_reward_zero_std": 0.0,
864
+ "grad_norm": 0.0035273912362754345,
865
+ "learning_rate": 0.0002,
866
+ "loss": 0.01652080938220024,
867
+ "num_tokens": 2223020.0,
868
+ "reward": 0.5712062120437622,
869
+ "reward_std": 0.3373561203479767,
870
+ "rewards/humor_reward/mean": 0.5712062120437622,
871
+ "rewards/humor_reward/std": 0.3373561203479767,
872
+ "sampling/importance_sampling_ratio/max": 1.9402817487716675,
873
+ "sampling/importance_sampling_ratio/mean": 0.13337960839271545,
874
+ "sampling/importance_sampling_ratio/min": 1.6416909431882232e-07,
875
+ "sampling/sampling_logp_difference/max": 6.734940528869629,
876
+ "sampling/sampling_logp_difference/mean": 0.03378872573375702,
877
+ "step": 31,
878
+ "step_time": 59.33454087101563
879
+ },
880
+ {
881
+ "cispo_clip_ratio": 0.6109768375754356,
882
+ "completions/clipped_ratio": 1.0,
883
+ "completions/max_length": 6144.0,
884
+ "completions/max_terminated_length": 0.0,
885
+ "completions/mean_length": 3135.3125,
886
+ "completions/mean_terminated_length": 0.0,
887
+ "completions/min_length": 350.0,
888
+ "completions/min_terminated_length": 0.0,
889
+ "entropy": 1.203492984175682,
890
+ "epoch": 0.11808118081180811,
891
+ "frac_reward_zero_std": 0.0,
892
+ "grad_norm": 8.204303594538942e-05,
893
+ "learning_rate": 0.0002,
894
+ "loss": 0.0006842454895377159,
895
+ "num_tokens": 2277121.0,
896
+ "reward": 0.6346642971038818,
897
+ "reward_std": 0.2692297101020813,
898
+ "rewards/humor_reward/mean": 0.6346642971038818,
899
+ "rewards/humor_reward/std": 0.2692297399044037,
900
+ "sampling/importance_sampling_ratio/max": 1.2163432836532593,
901
+ "sampling/importance_sampling_ratio/mean": 0.07933147996664047,
902
+ "sampling/importance_sampling_ratio/min": 3.8786645006894105e-09,
903
+ "sampling/sampling_logp_difference/max": 1.4427013397216797,
904
+ "sampling/sampling_logp_difference/mean": 0.03805601969361305,
905
+ "step": 32,
906
+ "step_time": 56.60236855900439
907
+ },
908
+ {
909
+ "cispo_clip_ratio": 0.6497912313789129,
910
+ "completions/clipped_ratio": 0.875,
911
+ "completions/max_length": 6144.0,
912
+ "completions/max_terminated_length": 4153.0,
913
+ "completions/mean_length": 3000.5625,
914
+ "completions/mean_terminated_length": 2144.5,
915
+ "completions/min_length": 136.0,
916
+ "completions/min_terminated_length": 136.0,
917
+ "entropy": 1.1894647032022476,
918
+ "epoch": 0.12177121771217712,
919
+ "frac_reward_zero_std": 0.0,
920
+ "grad_norm": 0.00014119563275016844,
921
+ "learning_rate": 0.0002,
922
+ "loss": 0.0002433458430459723,
923
+ "num_tokens": 2329962.0,
924
+ "reward": 0.5449860095977783,
925
+ "reward_std": 0.3282092809677124,
926
+ "rewards/humor_reward/mean": 0.5449860095977783,
927
+ "rewards/humor_reward/std": 0.3282092809677124,
928
+ "sampling/importance_sampling_ratio/max": 0.08825313299894333,
929
+ "sampling/importance_sampling_ratio/mean": 0.006083119660615921,
930
+ "sampling/importance_sampling_ratio/min": 1.4223322519768544e-09,
931
+ "sampling/sampling_logp_difference/max": 2.7407069206237793,
932
+ "sampling/sampling_logp_difference/mean": 0.04315001145005226,
933
+ "step": 33,
934
+ "step_time": 87.83407789799094
935
+ },
936
+ {
937
+ "cispo_clip_ratio": 0.603145282715559,
938
+ "completions/clipped_ratio": 0.9375,
939
+ "completions/max_length": 5923.0,
940
+ "completions/max_terminated_length": 1476.0,
941
+ "completions/mean_length": 3187.3125,
942
+ "completions/mean_terminated_length": 1476.0,
943
+ "completions/min_length": 1006.0,
944
+ "completions/min_terminated_length": 1476.0,
945
+ "entropy": 1.1579975709319115,
946
+ "epoch": 0.12546125461254612,
947
+ "frac_reward_zero_std": 0.0,
948
+ "grad_norm": 5.3547359129879624e-05,
949
+ "learning_rate": 0.0002,
950
+ "loss": 0.0002505858719814569,
951
+ "num_tokens": 2385343.0,
952
+ "reward": 0.5027609467506409,
953
+ "reward_std": 0.23998978734016418,
954
+ "rewards/humor_reward/mean": 0.5027609467506409,
955
+ "rewards/humor_reward/std": 0.23998978734016418,
956
+ "sampling/importance_sampling_ratio/max": 0.03983650356531143,
957
+ "sampling/importance_sampling_ratio/mean": 0.003191877156496048,
958
+ "sampling/importance_sampling_ratio/min": 9.33344068698716e-09,
959
+ "sampling/sampling_logp_difference/max": 2.9324636459350586,
960
+ "sampling/sampling_logp_difference/mean": 0.04020746424794197,
961
+ "step": 34,
962
+ "step_time": 55.506949331000214
963
+ },
964
+ {
965
+ "cispo_clip_ratio": 0.6451264955103397,
966
+ "completions/clipped_ratio": 0.9375,
967
+ "completions/max_length": 5022.0,
968
+ "completions/max_terminated_length": 529.0,
969
+ "completions/mean_length": 2561.125,
970
+ "completions/mean_terminated_length": 529.0,
971
+ "completions/min_length": 529.0,
972
+ "completions/min_terminated_length": 529.0,
973
+ "entropy": 1.1418700069189072,
974
+ "epoch": 0.12915129151291513,
975
+ "frac_reward_zero_std": 0.0,
976
+ "grad_norm": 0.0015835068188607693,
977
+ "learning_rate": 0.0002,
978
+ "loss": -0.004869138356298208,
979
+ "num_tokens": 2430257.0,
980
+ "reward": 0.5386431217193604,
981
+ "reward_std": 0.27275699377059937,
982
+ "rewards/humor_reward/mean": 0.5386431217193604,
983
+ "rewards/humor_reward/std": 0.27275702357292175,
984
+ "sampling/importance_sampling_ratio/max": 0.4542810022830963,
985
+ "sampling/importance_sampling_ratio/mean": 0.035323094576597214,
986
+ "sampling/importance_sampling_ratio/min": 4.0660263350700276e-11,
987
+ "sampling/sampling_logp_difference/max": 17.207799911499023,
988
+ "sampling/sampling_logp_difference/mean": 0.038973793387413025,
989
+ "step": 35,
990
+ "step_time": 69.47620079400076
991
+ },
992
+ {
993
+ "cispo_clip_ratio": 0.6264413706958294,
994
+ "completions/clipped_ratio": 0.9375,
995
+ "completions/max_length": 6144.0,
996
+ "completions/max_terminated_length": 4404.0,
997
+ "completions/mean_length": 3681.5,
998
+ "completions/mean_terminated_length": 4404.0,
999
+ "completions/min_length": 524.0,
1000
+ "completions/min_terminated_length": 4404.0,
1001
+ "entropy": 1.4374851435422897,
1002
+ "epoch": 0.13284132841328414,
1003
+ "frac_reward_zero_std": 0.0,
1004
+ "grad_norm": 0.0009762643603608012,
1005
+ "learning_rate": 0.0002,
1006
+ "loss": 0.006543246563524008,
1007
+ "num_tokens": 2493321.0,
1008
+ "reward": 0.5082834959030151,
1009
+ "reward_std": 0.34550100564956665,
1010
+ "rewards/humor_reward/mean": 0.5082834959030151,
1011
+ "rewards/humor_reward/std": 0.34550103545188904,
1012
+ "sampling/importance_sampling_ratio/max": 0.3936307430267334,
1013
+ "sampling/importance_sampling_ratio/mean": 0.03291841596364975,
1014
+ "sampling/importance_sampling_ratio/min": 5.0612318780451915e-09,
1015
+ "sampling/sampling_logp_difference/max": 2.5245909690856934,
1016
+ "sampling/sampling_logp_difference/mean": 0.03988190367817879,
1017
+ "step": 36,
1018
+ "step_time": 59.509835421005846
1019
+ },
1020
+ {
1021
+ "cispo_clip_ratio": 0.6464070416986942,
1022
+ "completions/clipped_ratio": 1.0,
1023
+ "completions/max_length": 6144.0,
1024
+ "completions/max_terminated_length": 0.0,
1025
+ "completions/mean_length": 3666.25,
1026
+ "completions/mean_terminated_length": 0.0,
1027
+ "completions/min_length": 1304.0,
1028
+ "completions/min_terminated_length": 0.0,
1029
+ "entropy": 1.4860893040895462,
1030
+ "epoch": 0.13653136531365315,
1031
+ "frac_reward_zero_std": 0.0,
1032
+ "grad_norm": 0.0014742233324795961,
1033
+ "learning_rate": 0.0002,
1034
+ "loss": 0.009775211103260517,
1035
+ "num_tokens": 2555917.0,
1036
+ "reward": 0.5413703918457031,
1037
+ "reward_std": 0.2888968586921692,
1038
+ "rewards/humor_reward/mean": 0.5413703918457031,
1039
+ "rewards/humor_reward/std": 0.2888968884944916,
1040
+ "sampling/importance_sampling_ratio/max": 0.5927172899246216,
1041
+ "sampling/importance_sampling_ratio/mean": 0.03740416467189789,
1042
+ "sampling/importance_sampling_ratio/min": 0.0,
1043
+ "sampling/sampling_logp_difference/max": 2.5622243881225586,
1044
+ "sampling/sampling_logp_difference/mean": 0.03937282785773277,
1045
+ "step": 37,
1046
+ "step_time": 84.5253242639883
1047
+ },
1048
+ {
1049
+ "cispo_clip_ratio": 0.5759155452251434,
1050
+ "completions/clipped_ratio": 0.875,
1051
+ "completions/max_length": 5372.0,
1052
+ "completions/max_terminated_length": 3480.0,
1053
+ "completions/mean_length": 2918.75,
1054
+ "completions/mean_terminated_length": 2408.0,
1055
+ "completions/min_length": 964.0,
1056
+ "completions/min_terminated_length": 1336.0,
1057
+ "entropy": 1.4434982985258102,
1058
+ "epoch": 0.14022140221402213,
1059
+ "frac_reward_zero_std": 0.0,
1060
+ "grad_norm": 0.001353645813651383,
1061
+ "learning_rate": 0.0002,
1062
+ "loss": -0.005609482992440462,
1063
+ "num_tokens": 2606553.0,
1064
+ "reward": 0.27517157793045044,
1065
+ "reward_std": 0.17297060787677765,
1066
+ "rewards/humor_reward/mean": 0.27517157793045044,
1067
+ "rewards/humor_reward/std": 0.17297060787677765,
1068
+ "sampling/importance_sampling_ratio/max": 0.8227623105049133,
1069
+ "sampling/importance_sampling_ratio/mean": 0.08773483335971832,
1070
+ "sampling/importance_sampling_ratio/min": 3.3746889016583737e-07,
1071
+ "sampling/sampling_logp_difference/max": 1.4419441223144531,
1072
+ "sampling/sampling_logp_difference/mean": 0.04054056107997894,
1073
+ "step": 38,
1074
+ "step_time": 70.05384999897797
1075
+ },
1076
+ {
1077
+ "cispo_clip_ratio": 0.47585512325167656,
1078
+ "completions/clipped_ratio": 1.0,
1079
+ "completions/max_length": 3937.0,
1080
+ "completions/max_terminated_length": 0.0,
1081
+ "completions/mean_length": 2637.875,
1082
+ "completions/mean_terminated_length": 0.0,
1083
+ "completions/min_length": 631.0,
1084
+ "completions/min_terminated_length": 0.0,
1085
+ "entropy": 1.9437123090028763,
1086
+ "epoch": 0.14391143911439114,
1087
+ "frac_reward_zero_std": 0.0,
1088
+ "grad_norm": 0.0001363922783639282,
1089
+ "learning_rate": 0.0002,
1090
+ "loss": 0.001440045889467001,
1091
+ "num_tokens": 2652695.0,
1092
+ "reward": 0.5775443911552429,
1093
+ "reward_std": 0.1908194124698639,
1094
+ "rewards/humor_reward/mean": 0.5775443911552429,
1095
+ "rewards/humor_reward/std": 0.1908194124698639,
1096
+ "sampling/importance_sampling_ratio/max": 0.04935026168823242,
1097
+ "sampling/importance_sampling_ratio/mean": 0.003360162954777479,
1098
+ "sampling/importance_sampling_ratio/min": 9.852194615689314e-09,
1099
+ "sampling/sampling_logp_difference/max": 2.8174242973327637,
1100
+ "sampling/sampling_logp_difference/mean": 0.04766388610005379,
1101
+ "step": 39,
1102
+ "step_time": 65.56160380599613
1103
+ },
1104
+ {
1105
+ "cispo_clip_ratio": 0.5537771657109261,
1106
+ "completions/clipped_ratio": 0.875,
1107
+ "completions/max_length": 6144.0,
1108
+ "completions/max_terminated_length": 3334.0,
1109
+ "completions/mean_length": 3318.3125,
1110
+ "completions/mean_terminated_length": 2593.5,
1111
+ "completions/min_length": 1005.0,
1112
+ "completions/min_terminated_length": 1853.0,
1113
+ "entropy": 1.6230905801057816,
1114
+ "epoch": 0.14760147601476015,
1115
+ "frac_reward_zero_std": 0.0,
1116
+ "grad_norm": 8.798116323305294e-05,
1117
+ "learning_rate": 0.0002,
1118
+ "loss": 0.0004228673642501235,
1119
+ "num_tokens": 2710396.0,
1120
+ "reward": 0.3043411374092102,
1121
+ "reward_std": 0.29077214002609253,
1122
+ "rewards/humor_reward/mean": 0.3043411374092102,
1123
+ "rewards/humor_reward/std": 0.29077211022377014,
1124
+ "sampling/importance_sampling_ratio/max": 0.019216708838939667,
1125
+ "sampling/importance_sampling_ratio/mean": 0.0028733701910823584,
1126
+ "sampling/importance_sampling_ratio/min": 4.04905886597362e-09,
1127
+ "sampling/sampling_logp_difference/max": 4.360086441040039,
1128
+ "sampling/sampling_logp_difference/mean": 0.04630262777209282,
1129
+ "step": 40,
1130
+ "step_time": 82.58221219903498
1131
+ },
1132
+ {
1133
+ "cispo_clip_ratio": 0.7186006270349026,
1134
+ "completions/clipped_ratio": 1.0,
1135
+ "completions/max_length": 5217.0,
1136
+ "completions/max_terminated_length": 0.0,
1137
+ "completions/mean_length": 3383.0,
1138
+ "completions/mean_terminated_length": 0.0,
1139
+ "completions/min_length": 706.0,
1140
+ "completions/min_terminated_length": 0.0,
1141
+ "entropy": 1.9114204794168472,
1142
+ "epoch": 0.15129151291512916,
1143
+ "frac_reward_zero_std": 0.0,
1144
+ "grad_norm": 0.005707095842808485,
1145
+ "learning_rate": 0.0002,
1146
+ "loss": 0.07725385576486588,
1147
+ "num_tokens": 2768684.0,
1148
+ "reward": 0.6538974642753601,
1149
+ "reward_std": 0.22179268300533295,
1150
+ "rewards/humor_reward/mean": 0.6538974642753601,
1151
+ "rewards/humor_reward/std": 0.22179268300533295,
1152
+ "sampling/importance_sampling_ratio/max": 2.3075239658355713,
1153
+ "sampling/importance_sampling_ratio/mean": 0.24276912212371826,
1154
+ "sampling/importance_sampling_ratio/min": 2.1112403203460417e-07,
1155
+ "sampling/sampling_logp_difference/max": 1.9947752952575684,
1156
+ "sampling/sampling_logp_difference/mean": 0.04460817202925682,
1157
+ "step": 41,
1158
+ "step_time": 50.73555830302939
1159
+ },
1160
+ {
1161
+ "cispo_clip_ratio": 0.4581793490797281,
1162
+ "completions/clipped_ratio": 1.0,
1163
+ "completions/max_length": 6144.0,
1164
+ "completions/max_terminated_length": 0.0,
1165
+ "completions/mean_length": 3443.75,
1166
+ "completions/mean_terminated_length": 0.0,
1167
+ "completions/min_length": 823.0,
1168
+ "completions/min_terminated_length": 0.0,
1169
+ "entropy": 1.684003859758377,
1170
+ "epoch": 0.15498154981549817,
1171
+ "frac_reward_zero_std": 0.0,
1172
+ "grad_norm": 6.124524952610955e-05,
1173
+ "learning_rate": 0.0002,
1174
+ "loss": -0.0007328057545237243,
1175
+ "num_tokens": 2828168.0,
1176
+ "reward": 0.38168156147003174,
1177
+ "reward_std": 0.1816880702972412,
1178
+ "rewards/humor_reward/mean": 0.38168156147003174,
1179
+ "rewards/humor_reward/std": 0.1816880702972412,
1180
+ "sampling/importance_sampling_ratio/max": 0.7620577812194824,
1181
+ "sampling/importance_sampling_ratio/mean": 0.052507393062114716,
1182
+ "sampling/importance_sampling_ratio/min": 0.0,
1183
+ "sampling/sampling_logp_difference/max": 1.3952629566192627,
1184
+ "sampling/sampling_logp_difference/mean": 0.044263970106840134,
1185
+ "step": 42,
1186
+ "step_time": 86.9609940169612
1187
+ },
1188
+ {
1189
+ "cispo_clip_ratio": 0.47743209451436996,
1190
+ "completions/clipped_ratio": 0.9375,
1191
+ "completions/max_length": 6144.0,
1192
+ "completions/max_terminated_length": 957.0,
1193
+ "completions/mean_length": 3697.8125,
1194
+ "completions/mean_terminated_length": 957.0,
1195
+ "completions/min_length": 608.0,
1196
+ "completions/min_terminated_length": 957.0,
1197
+ "entropy": 1.7134400680661201,
1198
+ "epoch": 0.15867158671586715,
1199
+ "frac_reward_zero_std": 0.0,
1200
+ "grad_norm": 0.0007043051300570369,
1201
+ "learning_rate": 0.0002,
1202
+ "loss": -0.0017245950875803828,
1203
+ "num_tokens": 2891493.0,
1204
+ "reward": 0.22081857919692993,
1205
+ "reward_std": 0.221486434340477,
1206
+ "rewards/humor_reward/mean": 0.22081857919692993,
1207
+ "rewards/humor_reward/std": 0.22148646414279938,
1208
+ "sampling/importance_sampling_ratio/max": 0.40315449237823486,
1209
+ "sampling/importance_sampling_ratio/mean": 0.02584739960730076,
1210
+ "sampling/importance_sampling_ratio/min": 2.196902926243638e-07,
1211
+ "sampling/sampling_logp_difference/max": 1.3882695436477661,
1212
+ "sampling/sampling_logp_difference/mean": 0.04035360738635063,
1213
+ "step": 43,
1214
+ "step_time": 79.38622622596449
1215
+ },
1216
+ {
1217
+ "cispo_clip_ratio": 0.4560951702296734,
1218
+ "completions/clipped_ratio": 1.0,
1219
+ "completions/max_length": 6144.0,
1220
+ "completions/max_terminated_length": 0.0,
1221
+ "completions/mean_length": 3416.6875,
1222
+ "completions/mean_terminated_length": 0.0,
1223
+ "completions/min_length": 1422.0,
1224
+ "completions/min_terminated_length": 0.0,
1225
+ "entropy": 2.0115034580230713,
1226
+ "epoch": 0.16236162361623616,
1227
+ "frac_reward_zero_std": 0.0,
1228
+ "grad_norm": 0.00011439115041866899,
1229
+ "learning_rate": 0.0002,
1230
+ "loss": 0.0011536129750311375,
1231
+ "num_tokens": 2950096.0,
1232
+ "reward": 0.44468164443969727,
1233
+ "reward_std": 0.2576386332511902,
1234
+ "rewards/humor_reward/mean": 0.44468164443969727,
1235
+ "rewards/humor_reward/std": 0.2576386630535126,
1236
+ "sampling/importance_sampling_ratio/max": 0.04263285920023918,
1237
+ "sampling/importance_sampling_ratio/mean": 0.004383362829685211,
1238
+ "sampling/importance_sampling_ratio/min": 1.0335932643101842e-07,
1239
+ "sampling/sampling_logp_difference/max": 2.972001552581787,
1240
+ "sampling/sampling_logp_difference/mean": 0.04580405354499817,
1241
+ "step": 44,
1242
+ "step_time": 83.37110035201476
1243
+ },
1244
+ {
1245
+ "cispo_clip_ratio": 0.5093751782551408,
1246
+ "completions/clipped_ratio": 1.0,
1247
+ "completions/max_length": 6144.0,
1248
+ "completions/max_terminated_length": 0.0,
1249
+ "completions/mean_length": 2493.1875,
1250
+ "completions/mean_terminated_length": 0.0,
1251
+ "completions/min_length": 544.0,
1252
+ "completions/min_terminated_length": 0.0,
1253
+ "entropy": 2.2517920583486557,
1254
+ "epoch": 0.16605166051660517,
1255
+ "frac_reward_zero_std": 0.0,
1256
+ "grad_norm": 0.0009305228013545275,
1257
+ "learning_rate": 0.0002,
1258
+ "loss": -0.008580348454415798,
1259
+ "num_tokens": 2994147.0,
1260
+ "reward": 0.36086541414260864,
1261
+ "reward_std": 0.23784013092517853,
1262
+ "rewards/humor_reward/mean": 0.36086541414260864,
1263
+ "rewards/humor_reward/std": 0.23784014582633972,
1264
+ "sampling/importance_sampling_ratio/max": 0.34510719776153564,
1265
+ "sampling/importance_sampling_ratio/mean": 0.02897491306066513,
1266
+ "sampling/importance_sampling_ratio/min": 1.3008460953388423e-13,
1267
+ "sampling/sampling_logp_difference/max": 4.940759658813477,
1268
+ "sampling/sampling_logp_difference/mean": 0.05280106142163277,
1269
+ "step": 45,
1270
+ "step_time": 81.29779967200011
1271
+ },
1272
+ {
1273
+ "cispo_clip_ratio": 0.4961427040398121,
1274
+ "completions/clipped_ratio": 0.9375,
1275
+ "completions/max_length": 6144.0,
1276
+ "completions/max_terminated_length": 1664.0,
1277
+ "completions/mean_length": 2631.6875,
1278
+ "completions/mean_terminated_length": 1664.0,
1279
+ "completions/min_length": 919.0,
1280
+ "completions/min_terminated_length": 1664.0,
1281
+ "entropy": 2.523237004876137,
1282
+ "epoch": 0.16974169741697417,
1283
+ "frac_reward_zero_std": 0.0,
1284
+ "grad_norm": 0.0004199541872367263,
1285
+ "learning_rate": 0.0002,
1286
+ "loss": 0.0025538511108607054,
1287
+ "num_tokens": 3040190.0,
1288
+ "reward": 0.3032156825065613,
1289
+ "reward_std": 0.2560163736343384,
1290
+ "rewards/humor_reward/mean": 0.3032156825065613,
1291
+ "rewards/humor_reward/std": 0.2560163736343384,
1292
+ "sampling/importance_sampling_ratio/max": 0.6374390125274658,
1293
+ "sampling/importance_sampling_ratio/mean": 0.047728560864925385,
1294
+ "sampling/importance_sampling_ratio/min": 3.620046413743694e-07,
1295
+ "sampling/sampling_logp_difference/max": 2.632134437561035,
1296
+ "sampling/sampling_logp_difference/mean": 0.047268252819776535,
1297
+ "step": 46,
1298
+ "step_time": 73.58007367796381
1299
+ },
1300
+ {
1301
+ "cispo_clip_ratio": 0.3577369898557663,
1302
+ "completions/clipped_ratio": 0.9375,
1303
+ "completions/max_length": 6144.0,
1304
+ "completions/max_terminated_length": 1734.0,
1305
+ "completions/mean_length": 2277.4375,
1306
+ "completions/mean_terminated_length": 1734.0,
1307
+ "completions/min_length": 899.0,
1308
+ "completions/min_terminated_length": 1734.0,
1309
+ "entropy": 2.4675562232732773,
1310
+ "epoch": 0.17343173431734318,
1311
+ "frac_reward_zero_std": 0.0,
1312
+ "grad_norm": 0.011461591348052025,
1313
+ "learning_rate": 0.0002,
1314
+ "loss": 0.08860153704881668,
1315
+ "num_tokens": 3081013.0,
1316
+ "reward": 0.2814764380455017,
1317
+ "reward_std": 0.293722927570343,
1318
+ "rewards/humor_reward/mean": 0.2814764380455017,
1319
+ "rewards/humor_reward/std": 0.293722927570343,
1320
+ "sampling/importance_sampling_ratio/max": 1.583497166633606,
1321
+ "sampling/importance_sampling_ratio/mean": 0.19163595139980316,
1322
+ "sampling/importance_sampling_ratio/min": 1.2635299029284397e-08,
1323
+ "sampling/sampling_logp_difference/max": 3.013592481613159,
1324
+ "sampling/sampling_logp_difference/mean": 0.05257761478424072,
1325
+ "step": 47,
1326
+ "step_time": 80.03998614201555
1327
+ },
1328
+ {
1329
+ "cispo_clip_ratio": 0.343157634139061,
1330
+ "completions/clipped_ratio": 0.875,
1331
+ "completions/max_length": 6144.0,
1332
+ "completions/max_terminated_length": 1813.0,
1333
+ "completions/mean_length": 1572.6875,
1334
+ "completions/mean_terminated_length": 1556.0,
1335
+ "completions/min_length": 505.0,
1336
+ "completions/min_terminated_length": 1299.0,
1337
+ "entropy": 2.2395776584744453,
1338
+ "epoch": 0.17712177121771217,
1339
+ "frac_reward_zero_std": 0.0,
1340
+ "grad_norm": 0.0015310291200876236,
1341
+ "learning_rate": 0.0002,
1342
+ "loss": 0.007011768873780966,
1343
+ "num_tokens": 3110336.0,
1344
+ "reward": 0.25455841422080994,
1345
+ "reward_std": 0.21981197595596313,
1346
+ "rewards/humor_reward/mean": 0.25455841422080994,
1347
+ "rewards/humor_reward/std": 0.21981199085712433,
1348
+ "sampling/importance_sampling_ratio/max": 1.104674220085144,
1349
+ "sampling/importance_sampling_ratio/mean": 0.09266799688339233,
1350
+ "sampling/importance_sampling_ratio/min": 6.321847649815027e-07,
1351
+ "sampling/sampling_logp_difference/max": 2.861496925354004,
1352
+ "sampling/sampling_logp_difference/mean": 0.048282306641340256,
1353
+ "step": 48,
1354
+ "step_time": 74.09863769897493
1355
+ },
1356
+ {
1357
+ "cispo_clip_ratio": 0.2400471270084381,
1358
+ "completions/clipped_ratio": 1.0,
1359
+ "completions/max_length": 6144.0,
1360
+ "completions/max_terminated_length": 0.0,
1361
+ "completions/mean_length": 1609.875,
1362
+ "completions/mean_terminated_length": 0.0,
1363
+ "completions/min_length": 105.0,
1364
+ "completions/min_terminated_length": 0.0,
1365
+ "entropy": 2.189691785722971,
1366
+ "epoch": 0.18081180811808117,
1367
+ "frac_reward_zero_std": 0.0,
1368
+ "grad_norm": 0.005181934684514999,
1369
+ "learning_rate": 0.0002,
1370
+ "loss": 0.004273830913007259,
1371
+ "num_tokens": 3140478.0,
1372
+ "reward": 0.2623010277748108,
1373
+ "reward_std": 0.2546696662902832,
1374
+ "rewards/humor_reward/mean": 0.2623010277748108,
1375
+ "rewards/humor_reward/std": 0.2546696662902832,
1376
+ "sampling/importance_sampling_ratio/max": 1.6369633674621582,
1377
+ "sampling/importance_sampling_ratio/mean": 0.1263563483953476,
1378
+ "sampling/importance_sampling_ratio/min": 0.0,
1379
+ "sampling/sampling_logp_difference/max": 1.9059357643127441,
1380
+ "sampling/sampling_logp_difference/mean": 0.043733954429626465,
1381
+ "step": 49,
1382
+ "step_time": 75.97332381599699
1383
+ },
1384
+ {
1385
+ "cispo_clip_ratio": 0.43684835731983185,
1386
+ "completions/clipped_ratio": 0.9375,
1387
+ "completions/max_length": 2413.0,
1388
+ "completions/max_terminated_length": 791.0,
1389
+ "completions/mean_length": 951.875,
1390
+ "completions/mean_terminated_length": 791.0,
1391
+ "completions/min_length": 434.0,
1392
+ "completions/min_terminated_length": 791.0,
1393
+ "entropy": 2.404914140701294,
1394
+ "epoch": 0.18450184501845018,
1395
+ "frac_reward_zero_std": 0.0,
1396
+ "grad_norm": 0.00264533469453454,
1397
+ "learning_rate": 0.0002,
1398
+ "loss": 0.007856368087232113,
1399
+ "num_tokens": 3159868.0,
1400
+ "reward": 0.24542748928070068,
1401
+ "reward_std": 0.2106892615556717,
1402
+ "rewards/humor_reward/mean": 0.24542748928070068,
1403
+ "rewards/humor_reward/std": 0.21068927645683289,
1404
+ "sampling/importance_sampling_ratio/max": 0.8078871965408325,
1405
+ "sampling/importance_sampling_ratio/mean": 0.06913819909095764,
1406
+ "sampling/importance_sampling_ratio/min": 8.870298984220426e-08,
1407
+ "sampling/sampling_logp_difference/max": 3.869755744934082,
1408
+ "sampling/sampling_logp_difference/mean": 0.0640038326382637,
1409
+ "step": 50,
1410
+ "step_time": 23.586737716992502
1411
+ }
1412
+ ],
1413
+ "logging_steps": 1,
1414
+ "max_steps": 200,
1415
+ "num_input_tokens_seen": 3159868,
1416
+ "num_train_epochs": 1,
1417
+ "save_steps": 50,
1418
+ "stateful_callbacks": {
1419
+ "TrainerControl": {
1420
+ "args": {
1421
+ "should_epoch_stop": false,
1422
+ "should_evaluate": false,
1423
+ "should_log": false,
1424
+ "should_save": true,
1425
+ "should_training_stop": false
1426
+ },
1427
+ "attributes": {}
1428
+ }
1429
+ },
1430
+ "total_flos": 0.0,
1431
+ "train_batch_size": 2,
1432
+ "trial_name": null,
1433
+ "trial_params": null
1434
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b58498f5c0b9297dffe3c5269aa516b68c7e2266979cd4f963c297338e66a05
3
+ size 7377