Lekr0 commited on
Commit
7772197
·
verified ·
1 Parent(s): b6fb2b0

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. ICL/DAPO/verl-recipe/collabllm/config/agent.yaml +2 -0
  2. ICL/DAPO/verl-recipe/collabllm/config/collabllm_interaction_config.yaml +10 -0
  3. ICL/DAPO/verl-recipe/collabllm/metrics/token_amount.py +26 -0
  4. ICL/DAPO/verl-recipe/dapo/config/dapo_trainer.yaml +28 -0
  5. ICL/DAPO/verl-recipe/deepeyes/configs/deepeyes_multiturn_grpo.yaml +32 -0
  6. ICL/DAPO/verl-recipe/entropy/config/entropy_trainer.yaml +39 -0
  7. ICL/DAPO/verl-recipe/entropy/reward_score/__init__.py +38 -0
  8. ICL/DAPO/verl-recipe/entropy/reward_score/entropy_math/__init__.py +1062 -0
  9. ICL/DAPO/verl-recipe/entropy/reward_score/entropy_math/grader.py +384 -0
  10. ICL/DAPO/verl-recipe/fapo/config/rm_config.yaml +49 -0
  11. ICL/DAPO/verl-recipe/fapo/run_fapo_genrm_train.sh +138 -0
  12. ICL/DAPO/verl-recipe/fapo/runtime_env.yaml +5 -0
  13. ICL/DAPO/verl-recipe/fault_recover/agent_loop/__init__.py +20 -0
  14. ICL/DAPO/verl-recipe/fault_recover/agent_loop/fault_recover_single_turn_agent_loop.py +111 -0
  15. ICL/DAPO/verl-recipe/fault_recover/config/fault_recover_ppo_megatron_trainer.yaml +265 -0
  16. ICL/DAPO/verl-recipe/fault_recover/vllm_rollout/__init__.py +13 -0
  17. ICL/DAPO/verl-recipe/fault_recover/vllm_rollout/vllm_async_server.py +104 -0
  18. ICL/DAPO/verl-recipe/flowrl/config/flowrl_trainer.yaml +33 -0
  19. ICL/DAPO/verl-recipe/flowrl/figures/file.svg +135 -0
  20. ICL/DAPO/verl-recipe/flowrl/figures/flowrl.pdf +0 -0
  21. ICL/DAPO/verl-recipe/flowrl/prepare/prepare_data.sh +43 -0
  22. ICL/DAPO/verl-recipe/flowrl/prepare/prepare_model.sh +10 -0
  23. ICL/DAPO/verl-recipe/gkd/megatron/megatron_utils.py +200 -0
  24. ICL/DAPO/verl-recipe/gvpo/config/gvpo_trainer.yaml +15 -0
  25. ICL/DAPO/verl-recipe/langgraph_agent/example/README.md +138 -0
  26. ICL/DAPO/verl-recipe/langgraph_agent/example/agent.yaml +2 -0
  27. ICL/DAPO/verl-recipe/langgraph_agent/example/create_dataset.py +290 -0
  28. ICL/DAPO/verl-recipe/langgraph_agent/example/math_expression.py +38 -0
  29. ICL/DAPO/verl-recipe/langgraph_agent/example/run_gpt_oss_20b_bf16.sh +143 -0
  30. ICL/DAPO/verl-recipe/langgraph_agent/example/run_qwen2.5_3b.sh +145 -0
  31. ICL/DAPO/verl-recipe/open_math_reasoning/run_eval.sh +7 -0
  32. ICL/DAPO/verl-recipe/prime/config/prime_trainer.yaml +77 -0
  33. ICL/DAPO/verl-recipe/qat/config/nvfp4_w4a16.json +34 -0
  34. ICL/DAPO/verl-recipe/qat/config/nvfp4_w4a4.json +45 -0
  35. ICL/DAPO/verl-recipe/r1/config/evaluation.yaml +14 -0
  36. ICL/DAPO/verl-recipe/r1/tasks/math_reward.py +35 -0
  37. ICL/DAPO/verl-recipe/r1_ascend/figures/response_len.png +0 -0
  38. ICL/DAPO/verl-recipe/r1_ascend/figures/rewards.png +0 -0
  39. ICL/DAPO/verl-recipe/r1_ascend/figures/val_score.png +0 -0
  40. ICL/DAPO/verl-recipe/rep_exp/config/_generated_ppo_megatron_trainer.yaml +594 -0
  41. ICL/DAPO/verl-recipe/rep_exp/config/_generated_ppo_trainer.yaml +563 -0
  42. ICL/DAPO/verl-recipe/rep_exp/config/actor/actor.yaml +215 -0
  43. ICL/DAPO/verl-recipe/rep_exp/config/actor/dp_actor.yaml +43 -0
  44. ICL/DAPO/verl-recipe/rep_exp/config/actor/megatron_actor.yaml +20 -0
  45. ICL/DAPO/verl-recipe/rep_exp/config/algorithm/rollout_correction.yaml +30 -0
  46. ICL/DAPO/verl-recipe/rep_exp/config/critic/critic.yaml +176 -0
  47. ICL/DAPO/verl-recipe/rep_exp/config/critic/dp_critic.yaml +66 -0
  48. ICL/DAPO/verl-recipe/rep_exp/config/critic/megatron_critic.yaml +43 -0
  49. ICL/DAPO/verl-recipe/rep_exp/config/data/legacy_data.yaml +131 -0
  50. ICL/DAPO/verl-recipe/rep_exp/config/engine/fsdp.yaml +56 -0
ICL/DAPO/verl-recipe/collabllm/config/agent.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ - name: collabllm_agent
2
+ _target_: recipe.collabllm.collabllm_agent_loop.CollabLLMAgentLoop
ICL/DAPO/verl-recipe/collabllm/config/collabllm_interaction_config.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ interaction:
2
+ - name: "collabllm"
3
+ class_name: "recipe.collabllm.collabllm_interation.CollabLLMInteraction"
4
+ config: {
5
+ "user_model": "gpt-4o-mini",
6
+ "num_retries": 3,
7
+ "max_tokens": 512,
8
+ "temperature": 1.0,
9
+ "enable_log": True
10
+ }
ICL/DAPO/verl-recipe/collabllm/metrics/token_amount.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 CollabLLM team and/or its affiliates
2
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
def compute_score(data_source, messages, ground_truth, extra_info, **kwargs):
    """Score a conversation by the rough token count of its future turns.

    Skips the first ``len(extra_info["prompt"])`` messages and sums a
    whitespace-split word count over the remaining messages' ``content``.

    NOTE(review): assumes each message exposes a ``.content`` string
    attribute — confirm against the caller's message type.
    """
    prompt_len = len(extra_info["prompt"])

    # Only turns produced after the initial prompt count toward the score.
    token_total = 0
    for message in messages[prompt_len:]:
        # Whitespace split is a cheap stand-in for a real tokenizer.
        token_total += len(message.content.split())

    return token_total
ICL/DAPO/verl-recipe/dapo/config/dapo_trainer.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ searchpath:
3
+ - file://verl/trainer/config
4
+
5
+ defaults:
6
+ - ppo_trainer
7
+ - _self_
8
+
9
+ data:
10
+ gen_batch_size: ${data.train_batch_size}
11
+
12
+ reward_model:
13
+ reward_manager: dapo
14
+ overlong_buffer:
15
+ enable: False # We try to avoid forgetting to set enable
16
+ len: 0
17
+ penalty_factor: 0.0
18
+ log: False
19
+
20
+ algorithm:
21
+ filter_groups:
22
+ _target_: verl.trainer.config.FilterGroupsConfig
23
+ enable: False # We try to avoid forgetting to set enable
24
+ metric: null # acc / score / seq_reward / seq_final_reward / ...
25
+ max_num_gen_batches: 0 # Non-positive values mean no upper limit
26
+
27
+ trainer:
28
+ project_name: verl-dapo
ICL/DAPO/verl-recipe/deepeyes/configs/deepeyes_multiturn_grpo.yaml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ searchpath:
3
+ - file://verl/trainer/config
4
+
5
+ defaults:
6
+ - ppo_trainer
7
+ - _self_
8
+
9
+ data:
10
+ max_prompt_length: 2048
11
+ max_response_length: 2048
12
+ train_batch_size: 256
13
+ return_raw_chat: True
14
+ return_multi_modal_inputs: False
15
+ custom_cls:
16
+ path: "recipe/deepeyes/deepeyes.py"
17
+ name: CustomRLHFDataset
18
+
19
+ actor_rollout_ref:
20
+ hybrid_engine: True
21
+ model:
22
+ custom_chat_template: "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{%- if tools %}{{- '<|im_start|>system\\n' }}{%- if messages[0]['role'] == 'system' %}{%- if messages[0]['content'] is string %}{{- messages[0]['content'] }}{%- else %}{{- messages[0]['content'][0]['text'] }}{%- endif %}{%- else %}{{- 'You are a helpful assistant.' }}{%- endif %}{{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}{%- for tool in tools %}{{- \"\\n\" }}{{- tool | tojson }}{%- endfor %}{{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}{% for message in messages %}{% if message['role'] != 'system' or loop.first == false %}{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{%- elif message.role == \"assistant\" %}{{- '<|im_start|>' + message.role }}{%- if message.content %}{{- '\\n' + message.content }}{%- endif 
%}{%- for tool_call in message.tool_calls %}{%- if tool_call.function is defined %}{%- set tool_call = tool_call.function %}{%- endif %}{{- '\\n<tool_call>\\n{\"name\": \"' }}{{- tool_call.name }}{{- '\", \"arguments\": ' }}{{- tool_call.arguments | tojson }}{{- '}\\n</tool_call>' }}{%- endfor %}{{- '<|im_end|>\\n' }}{%- elif message.role == \"tool\" %}{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}{{- '<|im_start|>user' }}{%- endif %}{{- '\\n<tool_response>\\n' }}{% if message['content'] is string %}{{ message.content }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif content['type'] == 'text' or 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{- '\\n</tool_response>' }}{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}{{- '<|im_end|>\\n' }}{%- endif %}{%- endif %}{% endif %}{% endfor %}{%- else %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: 
{% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{%- elif message.role == \"assistant\" %}{{- '<|im_start|>' + message.role }}{%- if message.content %}{{- '\\n' + message.content }}{%- endif %}{%- for tool_call in message.tool_calls %}{%- if tool_call.function is defined %}{%- set tool_call = tool_call.function %}{%- endif %}{{- '\\n<tool_call>\\n{\"name\": \"' }}{{- tool_call.name }}{{- '\", \"arguments\": ' }}{{- tool_call.arguments | tojson }}{{- '}\\n</tool_call>' }}{%- endfor %}{{- '<|im_end|>\\n' }}{%- elif message.role == \"tool\" %}{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}{{- '<|im_start|>user' }}{%- endif %}{{- '\\n<tool_response>\\n' }}{% if message['content'] is string %}{{ message.content }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif content['type'] == 'text' or 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{- '\\n</tool_response>' }}{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}{{- '<|im_end|>\\n' }}{%- endif %}{%- endif %}{% endfor %}{%- endif %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
23
+ rollout:
24
+ name: sglang
25
+ multi_turn:
26
+ enable: True
27
+ max_assistant_turns: 5
28
+ tool_config_path: "recipe/deepeyes/config/image_zoom_in_tool_config.yaml"
29
+
30
+ custom_reward_function:
31
+ path: "recipe/deepeyes/deepeyes.py"
32
+ name: compute_score
ICL/DAPO/verl-recipe/entropy/config/entropy_trainer.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ searchpath:
3
+ - file://verl/trainer/config
4
+
5
+ defaults:
6
+ - ppo_trainer
7
+ - _self_
8
+
9
+ data:
10
+ gen_batch_size: ${data.train_batch_size}
11
+
12
+ reward_model:
13
+ reward_kwargs:
14
+ overlong_buffer_cfg: ${reward_model.overlong_buffer}
15
+ reward_manager: dapo
16
+ overlong_buffer:
17
+ enable: False
18
+ len: 0
19
+ penalty_factor: 0.0
20
+ log: False
21
+
22
+ algorithm:
23
+ filter_groups:
24
+ enable: False # We try to avoid forgetting to set enable
25
+ metric: null # acc / score / seq_reward / seq_final_reward / ...
26
+ max_num_gen_batches: 0 # Non-positive values mean no upper limit
27
+
28
+ trainer:
29
+ project_name: verl-entropy
30
+
31
+ actor_rollout_ref:
32
+ actor:
33
+ policy_loss:
34
+ loss_mode: "vanilla" # vanilla / clip-cov / kl-cov from https://arxiv.org/abs/2505.22617
35
+ clip_cov_ratio: 0.0002 # for clip-cov loss
36
+ clip_cov_lb: 1.0 # for clip-cov loss
37
+ clip_cov_ub: 5.0 # for clip-cov loss
38
+ kl_cov_ratio: 0.0002 # for kl-cov loss
39
+ ppo_kl_coef: 0.1 # for kl-cov loss
ICL/DAPO/verl-recipe/entropy/reward_score/__init__.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # from . import gsm8k, math, prime_math, prime_code
15
+
16
+ import traceback
17
+
18
+ from . import entropy_math
19
+
20
+
21
def _default_compute_score(
    data_source, solution_str, ground_truth, extra_info=None, sandbox_fusion_url=None, concurrent_semaphore=None
):
    """Grade ``solution_str`` against ``ground_truth`` with the entropy math grader.

    Returns the grader's dict verdict unchanged, coerces scalar results
    (int/float/bool) to ``float``, and otherwise treats the result as a
    sequence whose first element is the score. Any grading error is logged
    with a full traceback and re-raised for the caller to handle.
    """
    try:
        res = entropy_math.compute_score(solution_str, str(ground_truth))

        if isinstance(res, dict):
            return res
        elif isinstance(res, int | float | bool):
            return float(res)
        else:
            # Sequence-like result: the score is its first element.
            return float(res[0])
    except Exception as e:
        print(f"[ERROR] Error in process_completion for task : {str(e)}")
        traceback.print_exc()  # print the full stack trace
        raise  # re-raise so the caller can catch it
ICL/DAPO/verl-recipe/entropy/reward_score/entropy_math/__init__.py ADDED
@@ -0,0 +1,1062 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 PRIME team and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Provides a math answer grading function with high recall.
16
+ Based on HF math_verify, verl, open reasoner zero, etc.
17
+ """
18
+
19
+ import os
20
+ import re
21
+ import signal
22
+ from itertools import islice, zip_longest
23
+ from math import isclose
24
+ from typing import Optional
25
+
26
+ import sympy
27
+ from latex2sympy2_extended import latex2sympy
28
+ from math_verify import ExprExtractionConfig, LatexExtractionConfig, parse, verify
29
+ from pylatexenc import latex2text
30
+ from sympy import N, simplify
31
+ from sympy.parsing import sympy_parser
32
+ from sympy.parsing.latex import parse_latex
33
+ from sympy.parsing.sympy_parser import parse_expr
34
+
35
+ """
36
+ This code is adapted from: Dr. GRPO (https://github.com/sail-sg/understand-r1-zero/blob/main/understand_r1_zero/math_grader.py).
37
+ """
38
+
39
+
40
def timeout_ours(timeout_seconds: int = 8):
    """Decorator factory: abort the wrapped call with ``TimeoutError`` after
    ``timeout_seconds`` seconds using SIGALRM.

    POSIX-only; raises ``NotImplementedError`` on any other OS.
    """
    if os.name != "posix":
        raise NotImplementedError(f"Unsupported OS: {os.name}")

    import signal

    def decorator(func):
        def _on_alarm(signum, frame):
            raise TimeoutError("Operation timed out!")

        def wrapper(*args, **kwargs):
            previous = signal.getsignal(signal.SIGALRM)
            signal.signal(signal.SIGALRM, _on_alarm)
            signal.alarm(timeout_seconds)
            try:
                return func(*args, **kwargs)
            finally:
                # Always cancel the pending alarm and restore the old handler,
                # even when the wrapped call raises.
                signal.alarm(0)
                signal.signal(signal.SIGALRM, previous)

        return wrapper

    return decorator
64
+
65
+
66
# Dan Hendrycks' code
def mathd_normalize_answer(answer: Optional[str]) -> Optional[str]:
    """Normalize an answer string (MATH-dataset-style normalization).

    Strips whitespace, unwraps a single enclosing ``\\text{...}``, then runs
    ``_strip_string``; on any failure the partially-normalized input is
    returned unchanged.
    """
    if answer is None:
        return None
    stripped = answer.strip()
    try:
        wrapped = re.search(r"^\\text\{(?P<text>.+?)\}$", stripped)
        if wrapped is not None:
            stripped = wrapped.group("text").strip()
        return _strip_string(stripped)
    except Exception:
        return stripped
79
+
80
+
81
+ # units mainly from MathQA
82
+ unit_texts = [
83
+ "east",
84
+ "degree",
85
+ "mph",
86
+ "kmph",
87
+ "ft",
88
+ "m square",
89
+ " m east",
90
+ "sq m",
91
+ "deg",
92
+ "mile",
93
+ "q .",
94
+ "monkey",
95
+ "prime",
96
+ "ratio",
97
+ "profit of rs",
98
+ "rd",
99
+ "o",
100
+ "gm",
101
+ "p . m",
102
+ "lb",
103
+ "tile",
104
+ "per",
105
+ "dm",
106
+ "lt",
107
+ "gain",
108
+ "ab",
109
+ "way",
110
+ "west",
111
+ "a .",
112
+ "b .",
113
+ "c .",
114
+ "d .",
115
+ "e .",
116
+ "f .",
117
+ "g .",
118
+ "h .",
119
+ "t",
120
+ "a",
121
+ "h",
122
+ "no change",
123
+ "men",
124
+ "soldier",
125
+ "pie",
126
+ "bc",
127
+ "excess",
128
+ "st",
129
+ "inches",
130
+ "noon",
131
+ "percent",
132
+ "by",
133
+ "gal",
134
+ "kmh",
135
+ "c",
136
+ "acre",
137
+ "rise",
138
+ "a . m",
139
+ "th",
140
+ "π r 2",
141
+ "sq",
142
+ "mark",
143
+ "l",
144
+ "toy",
145
+ "coin",
146
+ "sq . m",
147
+ "gallon",
148
+ "° f",
149
+ "profit",
150
+ "minw",
151
+ "yr",
152
+ "women",
153
+ "feet",
154
+ "am",
155
+ "pm",
156
+ "hr",
157
+ "cu cm",
158
+ "square",
159
+ "v â € ™",
160
+ "are",
161
+ "rupee",
162
+ "rounds",
163
+ "cubic",
164
+ "cc",
165
+ "mtr",
166
+ "s",
167
+ "ohm",
168
+ "number",
169
+ "kmph",
170
+ "day",
171
+ "hour",
172
+ "minute",
173
+ "min",
174
+ "second",
175
+ "man",
176
+ "woman",
177
+ "sec",
178
+ "cube",
179
+ "mt",
180
+ "sq inch",
181
+ "mp",
182
+ "∏ cm ³",
183
+ "hectare",
184
+ "more",
185
+ "sec",
186
+ "unit",
187
+ "cu . m",
188
+ "cm 2",
189
+ "rs .",
190
+ "rs",
191
+ "kg",
192
+ "g",
193
+ "month",
194
+ "km",
195
+ "m",
196
+ "cm",
197
+ "mm",
198
+ "apple",
199
+ "liter",
200
+ "loss",
201
+ "yard",
202
+ "pure",
203
+ "year",
204
+ "increase",
205
+ "decrease",
206
+ "d",
207
+ "less",
208
+ "Surface",
209
+ "litre",
210
+ "pi sq m",
211
+ "s .",
212
+ "metre",
213
+ "meter",
214
+ "inch",
215
+ ]
216
+
217
+ unit_texts.extend([t + "s" for t in unit_texts])
218
+
219
+
220
def _strip_string(string):
    """Aggressively normalize a LaTeX answer string for comparison.

    Applies, in a fixed order that later steps depend on: whitespace and
    escape cleanup, matrix-environment renaming, frac/sqrt shorthand
    expansion, unit and percentage removal, and a few value-level rewrites
    (e.g. leading ``.`` -> ``0.``, ``0.5`` -> ``\\frac{1}{2}``).
    Failures in the helpers fall back to returning the input unchanged.
    """

    def _fix_fracs(string):
        # Expand \frac shorthand: \frac12 -> \frac{1}{2}, \frac1{72} -> \frac{1}{72}.
        substrs = string.split("\\frac")
        new_str = substrs[0]
        if len(substrs) > 1:
            substrs = substrs[1:]
            for substr in substrs:
                new_str += "\\frac"
                if substr[0] == "{":
                    new_str += substr
                else:
                    try:
                        assert len(substr) >= 2
                    except Exception:
                        # Malformed tail after \frac: give up on the whole string.
                        return string
                    a = substr[0]
                    b = substr[1]
                    if b != "{":
                        if len(substr) > 2:
                            post_substr = substr[2:]
                            new_str += "{" + a + "}{" + b + "}" + post_substr
                        else:
                            new_str += "{" + a + "}{" + b + "}"
                    else:
                        if len(substr) > 2:
                            post_substr = substr[2:]
                            new_str += "{" + a + "}" + b + post_substr
                        else:
                            new_str += "{" + a + "}" + b
        string = new_str
        return string

    def _fix_a_slash_b(string):
        # Rewrite a plain integer fraction "a/b" as \frac{a}{b}; anything
        # non-integer (or with extra text) is returned unchanged.
        if len(string.split("/")) != 2:
            return string
        a = string.split("/")[0]
        b = string.split("/")[1]
        try:
            a = int(a)
            b = int(b)
            assert string == "{}/{}".format(a, b)
            new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
            return new_string
        except Exception:
            return string

    def _remove_right_units(string):
        # "\\text{ " only ever occurs (at least in the val set) when describing units
        if "\\text{ " in string:
            splits = string.split("\\text{ ")
            assert len(splits) == 2
            return splits[0]
        else:
            return string

    def _fix_sqrt(string):
        # Expand \sqrt shorthand: \sqrt3 -> \sqrt{3}.
        if "\\sqrt" not in string:
            return string
        splits = string.split("\\sqrt")
        new_string = splits[0]
        for split in splits[1:]:
            if split[0] != "{":
                a = split[0]
                new_substr = "\\sqrt{" + a + "}" + split[1:]
            else:
                new_substr = "\\sqrt" + split
            new_string += new_substr
        return new_string

    # linebreaks
    string = string.replace("\n", "")

    # remove inverse spaces
    string = string.replace("\\!", "")

    # replace \\ with \
    string = string.replace("\\\\", "\\")

    # matrix: normalize every array/bmatrix environment to pmatrix
    string = re.sub(r"\\begin\{array\}\{.*?\}", r"\\begin{pmatrix}", string)
    string = re.sub(r"\\end\{array\}", r"\\end{pmatrix}", string)
    string = string.replace("bmatrix", "pmatrix")

    # replace tfrac and dfrac with frac
    string = string.replace("tfrac", "frac")
    string = string.replace("dfrac", "frac")
    string = string.replace("\\neq", "\\ne").replace("\\leq", "\\le").replace("\\geq", "\\ge")

    # remove \left and \right
    string = string.replace("\\left", "")
    string = string.replace("\\right", "")

    # Remove unit: miles, dollars if after is not none
    _string = re.sub(r"\\text{.*?}$", "", string).strip()
    if _string != "" and _string != string:
        string = _string

    # Remove unit: texts (two passes, so plural forms stripped in pass one
    # expose a bare unit for pass two)
    for _ in range(2):
        for unit_text in unit_texts:
            # use regex, the prefix should be either the start of the string or a non-alphanumeric character
            # the suffix should be either the end of the string or a non-alphanumeric character
            _string = re.sub(r"(^|\W)" + unit_text + r"($|\W)", r"\1\2", string)
            if _string != "":
                string = _string

    # Remove circ (degrees)
    string = string.replace("^{\\circ}", "")
    string = string.replace("^\\circ", "")

    # remove dollar signs
    string = string.replace("\\$", "")

    # remove units (on the right)
    string = _remove_right_units(string)

    # remove percentage
    string = string.replace("\\\\%", "")
    string = string.replace("\\%", "")

    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
    string = string.replace(" .", " 0.")
    string = string.replace("{.", "{0.")
    # if empty, return empty string
    if len(string) == 0:
        return string
    if string[0] == ".":
        string = "0" + string

    # to consider: get rid of e.g. "k = " or "q = " at beginning
    if len(string.split("=")) == 2:
        if len(string.split("=")[0]) <= 2:
            string = string.split("=")[1]

    # fix sqrt3 --> sqrt{3}
    string = _fix_sqrt(string)

    # remove spaces
    string = string.replace(" ", "")

    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1).
    # Also does a/b --> \\frac{a}{b}
    string = _fix_fracs(string)

    # manually change 0.5 --> \frac{1}{2}
    if string == "0.5":
        string = "\\frac{1}{2}"

    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
    string = _fix_a_slash_b(string)

    return string
378
+
379
+
380
+ SUBSTITUTIONS = [
381
+ ("an ", ""),
382
+ ("a ", ""),
383
+ (".$", "$"),
384
+ ("\\$", ""),
385
+ (r"\ ", ""),
386
+ (" ", ""),
387
+ ("mbox", "text"),
388
+ (",\\text{and}", ","),
389
+ ("\\text{and}", ","),
390
+ ("\\text{m}", "\\text{}"),
391
+ ]
392
+
393
+
394
+ REMOVED_EXPRESSIONS = [
395
+ "square",
396
+ "ways",
397
+ "integers",
398
+ "dollars",
399
+ "mph",
400
+ "inches",
401
+ "ft",
402
+ "hours",
403
+ "km",
404
+ "units",
405
+ "\\ldots",
406
+ "sue",
407
+ "points",
408
+ "feet",
409
+ "minutes",
410
+ "digits",
411
+ "cents",
412
+ "degrees",
413
+ "cm",
414
+ "gm",
415
+ "pounds",
416
+ "meters",
417
+ "meals",
418
+ "edges",
419
+ "students",
420
+ "childrentickets",
421
+ "multiples",
422
+ "\\text{s}",
423
+ "\\text{.}",
424
+ "\\text{\ns}",
425
+ "\\text{}^2",
426
+ "\\text{}^3",
427
+ "\\text{\n}",
428
+ "\\text{}",
429
+ r"\mathrm{th}",
430
+ r"^\circ",
431
+ r"^{\circ}",
432
+ r"\;",
433
+ r",\!",
434
+ "{,}",
435
+ '"',
436
+ "\\dots",
437
+ ]
438
+
439
+
440
def normalize_final_answer(final_answer: str) -> str:
    """Normalize a final answer to a quantitative reasoning question.

    Applies the SUBSTITUTIONS / REMOVED_EXPRESSIONS tables, unwraps LaTeX
    decorations (``\\text``, ``\\textbf``, ``\\overline``, ``\\boxed``),
    expands shorthand ``\\frac`` / ``\\sqrt``, and strips thousands
    separators from pure digit strings.
    This code comes from https://arxiv.org/pdf/2206.14858.pdf, page18.
    """
    ans = final_answer
    for before, after in SUBSTITUTIONS:
        ans = ans.replace(before, after)
    for expr in REMOVED_EXPRESSIONS:
        ans = ans.replace(expr, "")

    # Extract the answer that is in LaTeX math, bold, boxed, etc.
    ans = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", ans)
    ans = re.sub(r"(\\text\{)(.*?)(\})", "\\2", ans)
    ans = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", ans)
    ans = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", ans)
    ans = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", ans)

    # Normalize shorthand TeX:
    #   \fracab -> \frac{a}{b},  \fracabc -> \frac{a}{b}c
    #   \sqrta  -> \sqrt{a},     \sqrtab  -> \sqrt{a}b
    ans = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", ans)
    ans = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", ans)
    ans = ans.replace("$", "")

    # Normalize 100,000 -> 100000
    if ans.replace(",", "").isdigit():
        ans = ans.replace(",", "")

    return ans
475
+
476
+
477
def repeatness(s: str):
    """Heuristic repetition detector.

    Builds a suffix array of ``s`` and sums the longest-common-prefix
    lengths between rank-adjacent suffixes; returns True when that sum
    exceeds 20% of the maximum possible (i.e. the string is highly
    self-repetitive). Returns 0 for strings of length <= 1.
    """

    def _ranks(seq):
        # Map each value to its rank among the distinct values of seq.
        order = {value: rank for rank, value in enumerate(sorted(set(seq)))}
        return [order[value] for value in seq]

    def _suffix_array(codes):
        # Prefix-doubling construction; returns (rank vector, positions).
        current = _ranks(codes)
        length, step, rank_of, positions = len(codes), 1, current, [0] * len(codes)
        while step < length - 1:
            current = _ranks(list(zip_longest(current, islice(current, step, None), fillvalue=-1)))
            rank_of, step = current, step << 1
        for pos, rank in enumerate(rank_of):
            positions[rank] = pos
        return rank_of, positions

    def _lcp(codes, sa, inv_sa):
        # Kasai's algorithm: LCP lengths of rank-adjacent suffixes.
        length, heights, match = len(codes), [0] * len(codes), 0

        for i in range(length):
            if inv_sa[i] == length - 1:
                match = 0
                continue

            j = sa[inv_sa[i] + 1]
            while i + match < length and j + match < length and codes[i + match] == codes[j + match]:
                match += 1

            heights[inv_sa[i]] = match
            if match > 0:
                match -= 1

        return heights

    codes = [ord(ch) for ch in s]
    total = len(codes)
    if total <= 1:
        return 0
    inv_sa, sa = _suffix_array(codes)
    lcp_sum = sum(_lcp(codes, sa, inv_sa))

    return (lcp_sum * 2 / (total * (total + 1))) > 0.2
518
+
519
+
520
class timeout:
    """Context manager: raise ``TimeoutError(error_message)`` if the body
    runs longer than ``seconds``. POSIX-only (relies on SIGALRM)."""

    def __init__(self, seconds=1, error_message="Timeout"):
        self.seconds = seconds
        self.error_message = error_message

    def handle_timeout(self, signum, frame):
        # SIGALRM handler installed by __enter__.
        raise TimeoutError(self.error_message)

    def __enter__(self):
        signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)

    def __exit__(self, exc_type, exc_value, exc_tb):
        # Cancel any pending alarm; exceptions propagate unchanged.
        signal.alarm(0)
534
+
535
+
536
def latex_eval(latex):
    """Parse a LaTeX string with sympy and return (symbolic expr, numeric value).

    Propagates whatever ``parse_latex`` / ``evalf`` raise on malformed input;
    callers wrap this in try/except.
    """
    sym = parse_latex(latex)
    val = sym.evalf()
    return sym, val
540
+
541
+
542
def numeric_equal(prediction: float, reference: float):
    """Return True when ``prediction`` matches ``reference`` within a
    relative tolerance of 1e-4.

    Note that the choice of relative (not absolute) tolerance has a
    significant impact on the synthesized GSM-Hard dataset.
    """
    return isclose(reference, prediction, rel_tol=1e-4)
550
+
551
+
552
@timeout_ours(timeout_seconds=5)
def symbolic_equal(a, b):
    """Best-effort symbolic equivalence of two answer strings via sympy.

    Tries several parsers (parse_latex, parse_expr, latex2sympy) and then
    a ladder of checks: direct/string equality, simplify(a-b)==0,
    |lhs-rhs| equality for equations, numeric closeness after N(), and
    elementwise matrix comparison.  Every stage is exception-guarded;
    if all fail the answers are considered unequal.  The whole function
    is bounded by a 5-second timeout because sympy can hang.
    """

    def _parse(s):
        # Try each parser twice: once with doubled backslashes collapsed,
        # once verbatim; fall back to returning the raw string.
        for f in [parse_latex, parse_expr, latex2sympy]:
            try:
                return f(s.replace("\\\\", "\\"))
            except Exception:
                try:
                    return f(s)
                except Exception:
                    pass
        return s

    a = _parse(a)
    b = _parse(b)

    # direct equal
    try:
        if str(a) == str(b) or a == b:
            return True
    except Exception:
        pass

    # simplify equal
    try:
        if a.equals(b) or simplify(a - b) == 0:
            return True
    except Exception:
        pass

    # equation equal: Eq(lhs, rhs) objects compare via |lhs - rhs|
    try:
        if (abs(a.lhs - a.rhs)).equals(abs(b.lhs - b.rhs)):
            return True
    except Exception:
        pass

    # numeric equal after full evaluation
    try:
        if numeric_equal(float(N(a)), float(N(b))):
            return True
    except Exception:
        pass

    # matrix
    try:
        # if a and b are matrix
        if a.shape == b.shape:
            _a = a.applyfunc(lambda x: round(x, 3))
            _b = b.applyfunc(lambda x: round(x, 3))
            if _a.equals(_b):
                return True
    except Exception:
        pass

    return False
607
+
608
+
609
def _is_latex_equal(str1, str2):
    """Compare two LaTeX answer strings for equivalence.

    First tries a direct symbolic/numeric comparison via ``latex_eval``;
    on any failure falls back to comparing the ``normalize_final_answer``
    forms.  Returns False instead of raising when nothing can be parsed.
    """
    try:
        sym1, val1 = latex_eval(str1)
        sym2, val2 = latex_eval(str2)
        if sym1 == sym2 or val1 == val2:
            return True
        else:
            raise ValueError
    except Exception:
        # Bug fix: previously ``norm1``/``norm2`` were referenced in the
        # inner except before assignment when normalize_final_answer
        # itself raised, producing a NameError instead of False.
        norm1 = norm2 = None
        try:
            norm1, norm2 = normalize_final_answer(str1), normalize_final_answer(str2)
            sym1, val1 = latex_eval(norm1)
            sym2, val2 = latex_eval(norm2)
            if sym1 == sym2 or val1 == val2:
                return True
        except Exception:
            return norm1 is not None and norm1 == norm2
    return False
627
+
628
+
629
def is_latex_equal(given_answer: str, ground_truth: str) -> bool:
    """Check LaTeX answer equivalence under a hard 1-second wall clock.

    Pipeline: reject highly repetitive long inputs (they hang the
    parsers), try normalized string equality, then fall back to
    math_verify's parse/verify.  Any failure or timeout yields False.
    """
    try:
        with timeout(1):
            try:
                if (len(given_answer) > 128 and repeatness(given_answer)) or (
                    len(ground_truth) > 128 and repeatness(ground_truth)
                ):
                    return False
                # First conduct normalized string matching.
                ground_truth_normalized = _normalize(ground_truth)
                given_normalized = _normalize(given_answer)
                if ground_truth_normalized is None:
                    return False
                if ground_truth_normalized == given_normalized:
                    return True

                # Next call math verify.
                # Bug fix: str.replace returns a new string; the results
                # were previously discarded, so newlines were never
                # actually stripped before parsing.
                given_answer = given_answer.replace("\n", "")
                ground_truth = ground_truth.replace("\n", "")
                if "$" not in given_answer:
                    given_answer = f"${given_answer}$"
                if "$" not in ground_truth:
                    ground_truth = f"${ground_truth}$"
                return verify(
                    parse(
                        ground_truth,
                        extraction_config=(
                            LatexExtractionConfig(boxed_match_priority=0),
                            ExprExtractionConfig(),
                        ),
                        fallback_mode="no_fallback",
                        extraction_mode=["first_match"],
                        parsing_timeout=1,
                    ),
                    parse(
                        given_answer,
                        extraction_config=(
                            LatexExtractionConfig(boxed_match_priority=0),
                            ExprExtractionConfig(),
                        ),
                        fallback_mode="no_fallback",
                        extraction_mode=["first_match"],
                        parsing_timeout=1,
                    ),
                    timeout_seconds=1,
                )
                # or symbolic_equal(ground_truth, given_answer)
            except Exception:
                return False
    except TimeoutError:
        return False
680
+
681
+
682
def is_value_equal(given_answer: str, ground_truth: str) -> bool:
    """Compare answers after mathd normalization, numerically when possible."""
    assert ground_truth is not None
    gt_norm = mathd_normalize_answer(ground_truth)
    pred_norm = mathd_normalize_answer(given_answer)

    if gt_norm == pred_norm:
        return True
    # Fall back to a numeric comparison; non-numeric strings fail closed.
    try:
        return float(gt_norm) == float(pred_norm)
    except Exception:
        return False
693
+
694
+
695
# sympy might hang -- we don't care about trying to be lenient in these cases
# Substrings / regexes that mark an expression as unsafe to hand to sympy
# (nested or multi-digit exponents are a common hang trigger).
BAD_SUBSTRINGS = ["^{", "^("]
BAD_REGEXES = [r"\^[0-9]+\^", r"\^[0-9][0-9]+"]
# Characters that may delimit a tuple/interval answer.
TUPLE_CHARS = "()[]"
699
+
700
+
701
def _sympy_parse(expr: str):
    """Parses an expression with sympy (caret is treated as exponentiation)."""
    transformations = sympy_parser.standard_transformations + (
        sympy_parser.implicit_multiplication_application,
    )
    return sympy_parser.parse_expr(expr.replace("^", "**"), transformations=transformations)
708
+
709
+
710
def _parse_latex(expr: str) -> str:
    """Attempts to parse latex to an expression sympy can read."""
    # Normalize fraction variants; the leading space before \frac keeps
    # mixed numbers such as "7\frac{3}{4}" separable after conversion.
    for old, new in (("\\tfrac", "\\frac"), ("\\dfrac", "\\frac"), ("\\frac", " \\frac")):
        expr = expr.replace(old, new)
    expr = latex2text.LatexNodes2Text().latex_to_text(expr)

    # Replace the specific characters that this parser uses.
    for symbol, replacement in (
        ("√", "sqrt"),
        ("π", "pi"),
        ("∞", "inf"),
        ("∪", "U"),
        ("·", "*"),
        ("×", "*"),
    ):
        expr = expr.replace(symbol, replacement)

    return expr.strip()
726
+
727
+
728
+ def _is_float(num: str) -> bool:
729
+ try:
730
+ float(num)
731
+ return True
732
+ except ValueError:
733
+ return False
734
+
735
+
736
+ def _is_int(x: float) -> bool:
737
+ try:
738
+ return abs(x - int(round(x))) <= 1e-7
739
+ except Exception:
740
+ return False
741
+
742
+
743
+ def _is_frac(expr: str) -> bool:
744
+ return bool(re.search(r"^-?[0-9]+.?/0*[1-9][0-9]*.?$", expr))
745
+
746
+
747
def _str_is_int(x: str) -> bool:
    """True when the comma-stripped string parses to (nearly) an integer."""
    try:
        value = float(_strip_properly_formatted_commas(x))
        return abs(value - int(round(value))) <= 1e-7
    except Exception:
        return False
754
+
755
+
756
+ def _str_to_int(x: str) -> bool:
757
+ x = x.replace(",", "")
758
+ x = float(x)
759
+ return int(x)
760
+
761
+
762
+ def _inject_implicit_mixed_number(step: str):
763
+ """
764
+ Automatically make a mixed number evalable
765
+ e.g. 7 3/4 => 7+3/4
766
+ """
767
+ p1 = re.compile("([0-9]) +([0-9])")
768
+ step = p1.sub("\\1+\\2", step) ## implicit mults
769
+ return step
770
+
771
+
772
+ def _strip_properly_formatted_commas(expr: str):
773
+ # We want to be careful because we don't want to strip tuple commas
774
+ p1 = re.compile(r"(\d)(,)(\d\d\d)($|\D)")
775
+ while True:
776
+ next_expr = p1.sub("\\1\\3\\4", expr)
777
+ if next_expr == expr:
778
+ break
779
+ expr = next_expr
780
+ return next_expr
781
+
782
+
783
def _normalize(expr: str) -> str:
    """Normalize answer expressions.

    Sequential cleanup pipeline (order matters): strip \\text{} wrappers,
    currency/percent markers, magnitude words, measurement units and
    stray LaTeX, then canonicalize integers.  Returns None for None.
    """
    if expr is None:
        return None

    # Remove enclosing `\text{}`.
    m = re.search(r"^\\text\{(?P<text>.+?)\}$", expr)
    if m is not None:
        expr = m.group("text")

    # Strip currency/percent markers; rewrite connectives as tuple commas.
    expr = expr.replace("\\%", "%")
    expr = expr.replace("\\$", "$")
    expr = expr.replace("$", "")
    expr = expr.replace("%", "")
    expr = expr.replace(" or ", " , ")
    expr = expr.replace(" and ", " , ")

    # Spell out magnitude words numerically.
    expr = expr.replace("million", "*10^6")
    expr = expr.replace("billion", "*10^9")
    expr = expr.replace("trillion", "*10^12")

    # Drop measurement units (optionally pluralized / raised to a power).
    for unit in [
        "degree",
        "cm",
        "centimeter",
        "meter",
        "mile",
        "second",
        "minute",
        "hour",
        "day",
        "week",
        "month",
        "year",
        "foot",
        "feet",
        "inch",
        "yard",
    ]:
        expr = re.sub(f"{unit}(es)?(s)? *(\\^[0-9]+)?", "", expr)
    expr = re.sub(r"\^ *\\circ", "", expr)

    # Unwrap a single pair of outer braces.
    if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
        expr = expr[1:-1]

    # Remove LaTeX thin-space thousands separators (",\!").
    expr = re.sub(",\\\\! *", "", expr)
    # Canonicalize near-integer floats to plain integers.
    if _is_float(expr) and _is_int(float(expr)):
        expr = str(int(round(float(expr))))
    # Remaining LaTeX markup: best-effort conversion to plain text.
    if "\\" in expr:
        try:
            expr = _parse_latex(expr)
        except Exception:
            pass

    # edge case with mixed numbers and negative signs
    expr = re.sub("- *", "-", expr)

    expr = _inject_implicit_mixed_number(expr)
    expr = expr.replace(" ", "")

    # if we somehow still have latex braces here, just drop them
    expr = expr.replace("{", "")
    expr = expr.replace("}", "")

    # don't be case sensitive for text answers
    expr = expr.lower()

    # Comma-grouped integer strings collapse to a bare int.
    if _str_is_int(expr):
        expr = str(_str_to_int(expr))

    return expr
854
+
855
+
856
def count_unknown_letters_in_expr(expr: str):
    """Count distinct alphabetic characters, ignoring 'sqrt'/'frac' keywords."""
    cleaned = expr.replace("sqrt", "").replace("frac", "")
    return len({ch for ch in cleaned if ch.isalpha()})
861
+
862
+
863
def should_allow_eval(expr: str):
    """Gate before handing ``expr`` to sympy.

    Rejects expressions with unknown text (more than two distinct
    letters) or exponent patterns known to make simplification hang.
    """
    # we don't want to try parsing unknown text or functions of more than two variables
    if count_unknown_letters_in_expr(expr) > 2:
        return False

    if any(bad_string in expr for bad_string in BAD_SUBSTRINGS):
        return False

    return not any(re.search(bad_regex, expr) is not None for bad_regex in BAD_REGEXES)
877
+
878
+
879
@timeout_ours(timeout_seconds=5)
def are_equal_under_sympy(ground_truth_normalized: str, given_normalized: str):
    """True when sympy can prove ``gt - given`` simplifies to zero.

    Guarded by should_allow_eval() to skip expressions known to hang
    sympy, and by a 5-second timeout decorator; any failure counts as
    "not equal".
    """
    are_equal = False
    try:
        expr = f"({ground_truth_normalized})-({given_normalized})"
        if should_allow_eval(expr):
            sympy_diff = _sympy_parse(expr)
            simplified = sympy.simplify(sympy_diff)
            if simplified == 0:
                are_equal = True
    except Exception:
        pass
    return are_equal
892
+
893
+
894
def split_tuple(expr: str):
    """
    Split the elements in a tuple/interval, while handling well-formatted commas in large numbers
    """
    expr = _strip_properly_formatted_commas(expr)
    if not expr:
        return []
    # Only split when the outer characters are tuple delimiters and the
    # interior contains no further delimiters (i.e. a flat tuple).
    is_flat_tuple = (
        len(expr) > 2
        and expr[0] in TUPLE_CHARS
        and expr[-1] in TUPLE_CHARS
        and not any(ch in expr[1:-1] for ch in TUPLE_CHARS)
    )
    if is_flat_tuple:
        return [elem.strip() for elem in expr[1:-1].split(",")]
    return [expr]
911
+
912
+
913
def last_boxed_only_string(string):
    """Return the last ``\\boxed{...}`` (or ``\\fbox{...}``) span, or None."""
    start = string.rfind("\\boxed")
    if start < 0:
        start = string.rfind("\\fbox")
        if start < 0:
            return None

    # Scan forward tracking brace depth until the opening group closes.
    depth = 0
    for pos in range(start, len(string)):
        ch = string[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return string[start : pos + 1]
    # Unbalanced braces: no closing brace found.
    return None
938
+
939
+
940
def remove_boxed(s):
    """Strip a leading ``\\boxed{`` and trailing ``}``; None when malformed."""
    prefix = "\\boxed{"
    try:
        if s.startswith(prefix) and s.endswith("}"):
            return s[len(prefix) : -1]
        return None
    except Exception:
        # Non-string input (e.g. None from last_boxed_only_string).
        return None
948
+
949
+
950
def extract_boxed_answer(solution: str) -> str:
    """Extract the answer from inside a LaTeX \\boxed{} command"""
    # remove_boxed tolerates None, so a missing \boxed simply yields None.
    return remove_boxed(last_boxed_only_string(solution))
955
+
956
+
957
def grade_answer_sympy(given_answer: str, ground_truth: str) -> bool:
    """Grade via normalization plus tuple-aware sympy equivalence.

    Strategy: normalized string match first; otherwise split both sides
    into tuple/interval elements and compare elementwise, with special
    cases so unreduced fractions and int/non-int mismatches are rejected
    without consulting sympy.
    """
    ground_truth_normalized = _normalize(ground_truth)
    given_normalized = _normalize(given_answer)

    if ground_truth_normalized is None:
        return False

    if ground_truth_normalized == given_normalized:
        return True

    if len(given_normalized) == 0:
        return False

    ground_truth_elems = split_tuple(ground_truth_normalized)
    given_elems = split_tuple(given_normalized)

    # For multi-element answers the bracket style must match too
    # (e.g. open vs closed interval endpoints).
    if len(ground_truth_elems) > 1 and (
        ground_truth_normalized[0] != given_normalized[0] or ground_truth_normalized[-1] != given_normalized[-1]
    ):
        is_correct = False
    elif len(ground_truth_elems) != len(given_elems):
        is_correct = False
    else:
        for ground_truth_elem, given_elem in zip(ground_truth_elems, given_elems, strict=True):
            if _is_frac(ground_truth_elem) and _is_frac(given_elem):
                # if fractions aren't reduced, then shouldn't be marked as correct
                # so, we don't want to allow sympy.simplify in this case
                is_correct = ground_truth_elem == given_elem
            elif _str_is_int(ground_truth_elem) != _str_is_int(given_elem):
                # if the ground truth answer is an integer, we require the given answer to be a strict match
                # (no sympy.simplify)
                is_correct = False
            else:
                is_correct = are_equal_under_sympy(ground_truth_elem, given_elem)
            if not is_correct:
                break

    return is_correct
995
+
996
+
997
def grade_answer_mathd(given_answer: str, ground_truth: str) -> bool:
    """Equality check after mathd normalization (the most lenient baseline)."""
    # be at least as lenient as mathd
    return mathd_normalize_answer(ground_truth) == mathd_normalize_answer(given_answer)
1005
+
1006
+
1007
def extract_answer(passage: str) -> str:
    """Return the contents of the last \\boxed{} in ``passage``, else None."""
    if "\\boxed" not in passage:
        return None
    return extract_boxed_answer(passage)
1011
+
1012
+
1013
def grade(model_answer: str, gt_answer: str, fast: bool = True):
    """Grade ``model_answer`` against ``gt_answer``.

    A boxed ground truth is unwrapped first.  ``fast=False`` additionally
    runs the math_verify-based latex matcher to recall answers the
    cheaper graders miss (slower and sensitive to bad inputs).
    """
    if "\\boxed" in gt_answer:
        gt_answer = extract_answer(gt_answer)
    correct = grade_answer_mathd(model_answer, gt_answer) or grade_answer_sympy(model_answer, gt_answer)
    if not correct and not fast:
        correct = is_latex_equal(model_answer, gt_answer)
    return correct
1025
+
1026
+
1027
def compute_score(model_response, gt_answer, fast=False):
    """Score a model response against one or more ground-truth answers.

    Returns a dict with keys ``score`` / ``format_score`` / ``acc`` /
    ``extracted_gt``.  ``format_score`` is 0.0 when no \\boxed answer
    could be extracted from the response, 1.0 otherwise.
    """
    model_answer = extract_answer(model_response)
    if model_answer is None:
        # Cannot even parse anything.
        return {
            "score": 0.0,
            "format_score": 0.0,
            "acc": False,
            "extracted_gt": gt_answer,
        }

    if isinstance(gt_answer, (int, float)):
        gt_answer = str(gt_answer)

    if isinstance(gt_answer, str):
        is_correct = bool(grade(model_answer, gt_answer, fast))
    elif isinstance(gt_answer, list):
        # Any matching ground truth counts as correct.
        is_correct = any(grade(model_answer, gt, fast) for gt in gt_answer)
    else:
        is_correct = False

    return {
        "score": 1.0 if is_correct else 0.0,
        "format_score": 1.0,
        "acc": is_correct,
        "extracted_gt": gt_answer,
    }
ICL/DAPO/verl-recipe/entropy/reward_score/entropy_math/grader.py ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Copyright (c) Microsoft Corporation.
16
+ #
17
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
18
+ # of this software and associated documentation files (the "Software"), to deal
19
+ # in the Software without restriction, including without limitation the rights
20
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
21
+ # copies of the Software, and to permit persons to whom the Software is
22
+ # furnished to do so, subject to the following conditions:
23
+ #
24
+ # The above copyright notice and this permission notice shall be included in all
25
+ # copies or substantial portions of the Software.
26
+ #
27
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
28
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
29
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
30
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
31
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
32
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33
+ # SOFTWARE
34
+
35
+ # Copyright (c) 2023 OpenAI
36
+ #
37
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
38
+ # of this software and associated documentation files (the "Software"), to deal
39
+ # in the Software without restriction, including without limitation the rights
40
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
41
+ # copies of the Software, and to permit persons to whom the Software is
42
+ # furnished to do so, subject to the following conditions:
43
+
44
+ # The above copyright notice and this permission notice shall be included in all
45
+ # copies or substantial portions of the Software.
46
+ #
47
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
48
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
49
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
50
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
51
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
52
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
53
+ # SOFTWARE.
54
+
55
+ # Copyright (c) 2021 Dan Hendrycks
56
+ #
57
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
58
+ # of this software and associated documentation files (the "Software"), to deal
59
+ # in the Software without restriction, including without limitation the rights
60
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
61
+ # copies of the Software, and to permit persons to whom the Software is
62
+ # furnished to do so, subject to the following conditions:
63
+ #
64
+ # The above copyright notice and this permission notice shall be included in all
65
+ # copies or substantial portions of the Software.
66
+ #
67
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
68
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
69
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
70
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
71
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
72
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
73
+ # SOFTWARE.
74
+
75
+ # Copyright 2024 PRIME team and/or its affiliates
76
+ #
77
+ # Licensed under the Apache License, Version 2.0 (the "License");
78
+ # you may not use this file except in compliance with the License.
79
+ # You may obtain a copy of the License at
80
+ #
81
+ # http://www.apache.org/licenses/LICENSE-2.0
82
+ #
83
+ # Unless required by applicable law or agreed to in writing, software
84
+ # distributed under the License is distributed on an "AS IS" BASIS,
85
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
86
+ # See the License for the specific language governing permissions and
87
+ # limitations under the License.
88
+ """
89
+ This logic is largely copied from the Hendrycks' MATH release (math_equivalence), and borrowed from:
90
+ - https://github.com/microsoft/ToRA/blob/main/src/eval/grader.py
91
+ - https://github.com/microsoft/ProphetNet/tree/master/CRITIC
92
+ - https://github.com/openai/prm800k
93
+ """
94
+
95
+ import contextlib
96
+ import math
97
+ import re
98
+ from math import isclose
99
+
100
+ # sympy related
101
+ from sympy import N, simplify
102
+ from sympy.parsing.latex import parse_latex
103
+ from sympy.parsing.sympy_parser import parse_expr
104
+
105
+ # verl related
106
+ from verl.utils.py_functional import timeout_limit
107
+
108
+
109
def is_digit(s):
    """Try to read ``s`` as a number, ignoring thousands separators.

    LaTeX-style ``{,}`` separators take precedence over plain commas.
    Returns ``(True, float_value)`` on success, ``(False, None)`` otherwise.
    """
    text = str(s)
    separator = "{,}" if "{,}" in text else ","
    try:
        return True, float(text.replace(separator, ""))
    except ValueError:
        return False, None
119
+
120
+
121
def normalize(answer, pi) -> str:
    """Canonicalize an answer before comparison.

    Strips a leading ``$`` from dollar amounts and trailing percent
    markers; otherwise resolves base-notation suffixes and expands
    ``\\pi`` using the supplied ``pi`` value.
    """
    if isinstance(answer, str):
        # $<number> -> drop the leading dollar sign
        if re.match(r"\$\d+(\.\d+)?", answer):
            return answer[1:]
        # <number>% or <number>\% -> drop the percent marker
        if re.match(r"^\d+(\.\d+)?%$", answer) or re.match(r"^\d+(\.\d+)?\\%$", answer):
            return answer.replace("\\%", "").replace("%", "")

    # handle base notation, then pi substitution
    return handle_pi(handle_base(answer), pi)
139
+
140
+
141
def handle_base(x):
    """Strip a base-notation suffix, e.g. ``"101_2" -> 101``.

    Only the digits before the underscore are kept (the base itself is
    ignored); non-string or underscore-free input is returned unchanged.
    Bug fix: the previous ``-> str`` annotation was wrong -- the function
    returns an int for base-notation strings and the input otherwise.
    """
    if isinstance(x, str) and "_" in x:
        # Due to base
        return int(float(x.split("_")[0]))
    return x
148
+
149
+
150
def handle_pi(string, pi):
    """Rewrite ``\\pi`` occurrences as multiplications by ``pi`` and try
    to evaluate the result numerically.

    ``pi`` is interpolated via its repr (e.g. ``3.141592653589793``), so
    ``2\\pi`` becomes ``2*3.14...`` and a bare ``\\pi`` becomes
    ``1*3.14...``.  If evaluation fails, the rewritten string is
    returned as-is.
    """
    if isinstance(string, str) and "\\pi" in string:
        # Find the first occurrence of "\pi"
        idx = string.find("\\pi")

        # Iterate over the string and find all occurrences of "\pi" with a valid previous character
        while idx != -1:
            if idx > 0 and string[idx - 1].isdigit():
                # Replace "\pi" with "*math.pi" if the previous character is a digit
                string = string[:idx] + f"*{pi}" + string[idx + 3 :]
            else:
                # Replace "\pi" with "1*math.pi" if the previous character is not a digit
                string = string[:idx] + f"1*{pi}" + string[idx + 3 :]

            # Find the next occurrence of "\pi"
            idx = string.find("\\pi", idx + 1)

        # Evaluate the expression using eval() function
        # SECURITY NOTE(review): eval() runs on model-produced text.
        # Exceptions are suppressed, but arbitrary expressions can still
        # execute -- consider a restricted evaluator.
        with contextlib.suppress(Exception):
            string = eval(string)

    return string
172
+
173
+
174
def math_equal(
    prediction: bool | float | str,
    reference: float | str,
    include_percentage: bool = True,
    tolerance: float = 1e-4,
    timeout: float = 10.0,
    pi: float = math.pi,
) -> bool:
    """
    Exact match of math if and only if:
    1. numerical equal: both can convert to float and are equal
    2. symbolic equal: both can convert to sympy expression and are equal

    Checks are ordered from cheapest to most expensive: string equality,
    numeric comparison (optionally accepting percentage re-scalings),
    interval/tuple elementwise recursion, Point/Matrix special cases,
    and finally sympy-based symbolic comparison bounded by ``timeout``.
    """

    prediction = normalize(prediction, pi)
    reference = normalize(reference, pi)

    if isinstance(prediction, str) and len(prediction) > 1000:  # handling weird corner-cases
        prediction = prediction[:1000]

    # 0. string comparison
    if isinstance(prediction, str) and isinstance(reference, str):
        if prediction.strip().lower() == reference.strip().lower():
            return True
        if prediction.replace(" ", "") == reference.replace(" ", ""):
            return True

    try:  # 1. numerical equal
        if is_digit(prediction)[0] and is_digit(reference)[0]:
            prediction = is_digit(prediction)[1]
            reference = is_digit(reference)[1]
            # number questions: also accept the reference read as a
            # fraction-of-one or as a percentage.
            gt_result = [reference / 100, reference, reference * 100] if include_percentage else [reference]
            for item in gt_result:
                try:
                    if isclose(item, prediction, rel_tol=tolerance):
                        return True
                except Exception:
                    continue
            return False
    except Exception:
        pass

    # Empty/None predictions fail fast (0 and False are legitimate answers).
    if not prediction and prediction not in [0, False]:
        return False

    # 2. symbolic equal
    reference = str(reference).strip()
    prediction = str(prediction).strip()

    ## deal with [], (), {}
    prediction = format_intervals(prediction)

    pred_str, ref_str = prediction, reference
    # Strip mismatched brackets/braces and retry plain string equality.
    if (prediction.startswith("[") and prediction.endswith("]") and not reference.startswith("(")) or (
        prediction.startswith("(") and prediction.endswith(")") and not reference.startswith("[")
    ):
        pred_str = pred_str.strip("[]()")
        ref_str = ref_str.strip("[]()")
    for s in ["{", "}", "(", ")"]:
        ref_str = ref_str.replace(s, "")
        pred_str = pred_str.replace(s, "")
    if pred_str == ref_str:
        return True

    ## [a, b] vs. [c, d], return a==c and b==d
    if (
        prediction
        and reference
        and prediction[0] in "(["
        and prediction[-1] in ")]"
        and prediction[0] == reference[0]
        and prediction[-1] == reference[-1]
    ):
        pred_parts = prediction[1:-1].split(",")
        ref_parts = reference[1:-1].split(",")
        if len(pred_parts) == len(ref_parts) and all(
            [
                math_equal(pred_pt, ref_pt, include_percentage, tolerance)
                for pred_pt, ref_pt in zip(pred_parts, ref_parts, strict=True)
            ]
        ):
            return True

    # Bare comma-separated lists: compare elementwise.
    if "," in prediction and "," in reference:
        pred_parts = [item.strip() for item in prediction.split(",")]
        ref_parts = [item.strip() for item in reference.split(",")]

        if len(pred_parts) == len(ref_parts):
            return bool(
                all(
                    [
                        math_equal(pred_parts[i], ref_parts[i], include_percentage, tolerance)
                        for i in range(len(pred_parts))
                    ]
                )
            )

    # if we have point == tuple of values
    if prediction.startswith("Point") and reference[0] == "(" and reference[-1] == ")":
        pred_parts = prediction[prediction.find("(") + 1 : -1].split(",")
        ref_parts = reference[1:-1].split(",")
        if len(pred_parts) == len(ref_parts) and all(
            [
                math_equal(pred_pt, ref_pt, include_percentage, tolerance)
                for pred_pt, ref_pt in zip(pred_parts, ref_parts, strict=True)
            ]
        ):
            return True

    # if reference is a matrix
    if r"\begin{pmatrix}" in reference and prediction.startswith("Matrix"):
        try:
            pred_matrix = parse_expr(prediction)
            # every second whitespace token of the pmatrix body is an entry
            ref_matrix_items = reference.split()[1:-1:2]
            if len(pred_matrix) == len(ref_matrix_items) and all(
                [
                    math_equal(pred, ref, include_percentage, tolerance)
                    for ref, pred in zip(ref_matrix_items, pred_matrix, strict=True)
                ]
            ):
                return True
        except Exception:
            pass
    elif r"\begin{pmatrix}" in reference and prediction.startswith("[") and prediction.endswith("]"):
        # SECURITY NOTE(review): eval() runs on model-produced text here.
        if isinstance(eval(prediction), list):
            try:
                pred_matrix = eval(prediction)
                # ref_matrix_items = reference.split()[1:-1:2]
                ref_matrix_items = (
                    reference.removeprefix(r"\\begin{pmatrix}")
                    .removeprefix(r"\begin{pmatrix}")
                    .removesuffix(r"\\end{pmatrix}")
                    .removesuffix(r"\end{pmatrix}")
                )
                ref_matrix_items = ref_matrix_items.split("\\")
                ref_matrix_items = [row.split("&") if "&" in row else row for row in ref_matrix_items]
                if len(pred_matrix) == len(ref_matrix_items) and all(
                    [
                        math_equal(pred, ref, include_percentage, tolerance)
                        for ref, pred in zip(ref_matrix_items, pred_matrix, strict=True)
                    ]
                ):
                    return True
            except Exception:
                pass

    return symbolic_equal(prediction, reference, tolerance, timeout)
322
+
323
+
324
def symbolic_equal(a, b, tolerance, timeout=10.0):
    """Sympy-based equivalence with per-step wall-clock limits.

    Parses both sides (parse_expr first, then parse_latex), then tries
    ``simplify(a - b) == 0`` and finally numeric closeness of N(a) vs
    N(b) under ``tolerance``.  Every stage is bounded by ``timeout``
    seconds via timeout_limit; failures of all stages mean False.
    """

    def _parse(s):
        # First parser that succeeds within the time limit wins; the raw
        # string is kept when nothing parses.
        for f in [parse_expr, parse_latex]:
            try:
                with timeout_limit(seconds=timeout):
                    return f(s)
            except TimeoutError:
                print(f"Parsing timed out for {s}")
                continue
            except Exception:
                continue
        return s

    a = _parse(a)
    b = _parse(b)

    try:
        with timeout_limit(seconds=timeout):
            if simplify(a - b) == 0:
                return True
    except TimeoutError:
        print(f"Simplification timed out for {a} - {b}")
        pass
    except Exception:
        pass

    try:
        with timeout_limit(seconds=timeout):
            if isclose(N(a), N(b), rel_tol=tolerance):
                return True
    except TimeoutError:
        print(f"Numerical evaluation timed out for {a}, {b}")
        pass
    except Exception:
        pass
    return False
360
+
361
+
362
def format_intervals(prediction):
    """Rewrite a sympy ``Interval`` repr into bracket notation.

    Interval(a, b) -> [a, b]; Interval.Ropen -> [a, b);
    Interval.Lopen -> (a, b]; Interval.open -> (a, b).
    Anything else is returned unchanged.
    """
    interval_brackets = {
        r"^Interval\((.*)\)$": ("[", "]"),
        r"^Interval\.Ropen\((.*)\)$": ("[", ")"),
        r"^Interval\.Lopen\((.*)\)$": ("(", "]"),
        r"^Interval\.open\((.*)\)$": ("(", ")"),
    }

    for pattern, (left, right) in interval_brackets.items():
        match = re.match(pattern, prediction)
        if match:
            return f"{left}{match.group(1)}{right}"

    return prediction
ICL/DAPO/verl-recipe/fapo/config/rm_config.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ searchpath:
3
+ - file://verl/trainer/config
4
+
5
+ defaults:
6
+ - ppo_trainer
7
+ - _self_
8
+
9
+ reward_model:
10
+ _target_: verl.workers.config.RewardModelConfig
11
+
12
+ reward_manager: dapo
13
+ enable: False
14
+
15
+ # Whether to deploy the model to a separate resource pool.
16
+ enable_resource_pool: False
17
+ n_gpus_per_node: 0
18
+ nnodes: 0
19
+
20
+ model:
21
+ type: discriminative
22
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
23
+ external_lib: ${actor_rollout_ref.model.external_lib}
24
+ trust_remote_code: False
25
+
26
+ rollout:
27
+ _target_: verl.workers.config.RolloutConfig
28
+ name: ???
29
+ dtype: bfloat16
30
+ gpu_memory_utilization: 0.5
31
+ enforce_eager: true
32
+ cudagraph_capture_sizes: null
33
+ free_cache_engine: true
34
+ data_parallel_size: 1
35
+ expert_parallel_size: 1
36
+ tensor_model_parallel_size: 2
37
+ max_num_batched_tokens: 8192
38
+ max_model_len: null
39
+ max_num_seqs: 1024
40
+ load_format: auto
41
+ engine_kwargs: {}
42
+ limit_images: null
43
+ enable_chunked_prefill: true
44
+ enable_prefix_caching: true
45
+ disable_log_stats: true
46
+ skip_tokenizer_init: true
47
+
48
+ prompt_length: 512
49
+ response_length: 512
ICL/DAPO/verl-recipe/fapo/run_fapo_genrm_train.sh ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -xeuo pipefail
3
+
4
+ project_name='FAPO-Reproduce'
5
+ exp_name='FAPO-GenRM-4B'
6
+
7
+ adv_estimator=grpo
8
+
9
+ use_kl_in_reward=False
10
+ kl_coef=0.0
11
+ use_kl_loss=False
12
+ kl_loss_coef=0.0
13
+
14
+ clip_ratio_low=0.2
15
+ clip_ratio_high=0.28
16
+
17
+ max_prompt_length=$((1024 * 5))
18
+ max_response_length=$((1024 * 8))
19
+ enable_overlong_buffer=True
20
+ overlong_buffer_len=$((1024 * 4))
21
+ overlong_penalty_factor=1.0
22
+
23
+ loss_agg_mode="token-mean"
24
+
25
+ train_prompt_bsz=512
26
+ n_resp_per_prompt=16
27
+ train_prompt_mini_bsz=32
28
+
29
+ # Ray
30
+ RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
31
+ WORKING_DIR=${WORKING_DIR:-"${PWD}"}
32
+ RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
33
+ NNODES=${NNODES:-4}
34
+ NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
35
+ # Paths
36
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
37
+ # very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
38
+ MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-4B-Instruct-2507"}
39
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
40
+ TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/train.parquet"}
41
+ TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/test.parquet"}
42
+
43
+ # Algorithm
44
+ temperature=1.2
45
+ top_p=1.0
46
+ top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
47
+ val_temperature=0.6
48
+ val_top_p=0.95
49
+
50
+ # Performance Related Parameter
51
+ sp_size=1
52
+ use_dynamic_bsz=True
53
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
54
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
55
+ offload=True
56
+ gen_tp=1
57
+ fsdp_size=8
58
+
59
+ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
60
+ --address "${RAY_ADDRESS}" \
61
+ --working-dir "${WORKING_DIR}" \
62
+ -- python3 -m verl.trainer.main_ppo \
63
+ data.train_files="${TRAIN_FILE}" \
64
+ data.val_files="${TEST_FILE}" \
65
+ data.prompt_key=prompt \
66
+ data.truncation='left' \
67
+ data.max_prompt_length=${max_prompt_length} \
68
+ data.max_response_length=${max_response_length} \
69
+ data.train_batch_size=${train_prompt_bsz} \
70
+ data.return_raw_chat=True \
71
+ data.filter_overlong_prompts=True \
72
+ data.truncation='error' \
73
+ actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
74
+ algorithm.adv_estimator=${adv_estimator} \
75
+ algorithm.use_kl_in_reward=${use_kl_in_reward} \
76
+ algorithm.kl_ctrl.kl_coef=${kl_coef} \
77
+ actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
78
+ actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
79
+ actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
80
+ actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
81
+ actor_rollout_ref.actor.clip_ratio_c=10.0 \
82
+ actor_rollout_ref.model.use_remove_padding=True \
83
+ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
84
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
85
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
86
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
87
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
88
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
89
+ actor_rollout_ref.rollout.name=vllm \
90
+ actor_rollout_ref.rollout.mode=async \
91
+ actor_rollout_ref.model.path="${MODEL_PATH}" \
92
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
93
+ actor_rollout_ref.actor.optim.lr=1e-6 \
94
+ actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
95
+ actor_rollout_ref.actor.optim.weight_decay=0.1 \
96
+ actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
97
+ actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
98
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
99
+ actor_rollout_ref.actor.entropy_coeff=0 \
100
+ actor_rollout_ref.actor.grad_clip=1.0 \
101
+ actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
102
+ actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
103
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
104
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
105
+ actor_rollout_ref.rollout.enable_chunked_prefill=True \
106
+ actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
107
+ actor_rollout_ref.rollout.temperature=${temperature} \
108
+ actor_rollout_ref.rollout.top_p=${top_p} \
109
+ actor_rollout_ref.rollout.top_k=${top_k} \
110
+ actor_rollout_ref.rollout.val_kwargs.temperature=${val_temperature} \
111
+ actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
112
+ actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
113
+ actor_rollout_ref.rollout.val_kwargs.do_sample=True \
114
+ actor_rollout_ref.rollout.val_kwargs.n=1 \
115
+ actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
116
+ actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
117
+ actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
118
+ reward_model.reward_manager=dapo \
119
+ +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
120
+ +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
121
+ +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
122
+ +reward_model.reward_kwargs.overlong_buffer_cfg.log=True \
123
+ +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
124
+ custom_reward_function.path=recipe/fapo/reward_fn_genrm.py \
125
+ custom_reward_function.name=compute_score_fapo_genrm \
126
+ trainer.logger='["console","wandb"]' \
127
+ trainer.project_name="${project_name}" \
128
+ trainer.experiment_name="${exp_name}" \
129
+ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
130
+ trainer.nnodes="${NNODES}" \
131
+ trainer.val_before_train=True \
132
+ trainer.test_freq=10 \
133
+ trainer.save_freq=10 \
134
+ trainer.total_epochs=10 \
135
+ trainer.total_training_steps=500 \
136
+ trainer.default_local_dir="${CKPTS_DIR}" \
137
+ trainer.resume_mode=auto \
138
+ trainer.log_val_generations=10
ICL/DAPO/verl-recipe/fapo/runtime_env.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ working_dir: ./
2
+ excludes: ["/.git/"]
3
+ env_vars:
4
+ TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
5
+ VLLM_USE_V1: "1"
ICL/DAPO/verl-recipe/fault_recover/agent_loop/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .fault_recover_agent_loop import FaultRecoverAgentLoopManager
16
+ from .fault_recover_single_turn_agent_loop import FaultRecoverSingleTurnAgentLoop
17
+
18
+ _ = [FaultRecoverSingleTurnAgentLoop, FaultRecoverAgentLoopManager]
19
+
20
+ __all__ = ["FaultRecoverSingleTurnAgentLoop", "FaultRecoverAgentLoopManager"]
ICL/DAPO/verl-recipe/fault_recover/agent_loop/fault_recover_single_turn_agent_loop.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import logging
15
+ import os
16
+ from typing import Any
17
+ from uuid import uuid4
18
+
19
+ from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register
20
+ from verl.tools.utils.tool_registry import initialize_tools_from_config
21
+ from verl.utils.profiler import simple_timer
22
+
23
+ logger = logging.getLogger(__file__)
24
+ logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
25
+
26
+
27
+ @register("fault_recover_single_turn_agent")
28
+ class FaultRecoverSingleTurnAgentLoop(AgentLoopBase):
29
+ """Naive agent loop that only do single turn chat completion."""
30
+
31
+ def __init__(self, *args, **kwargs):
32
+ super().__init__(*args, **kwargs)
33
+ self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length
34
+ self.response_length = self.config.actor_rollout_ref.rollout.response_length
35
+
36
+ tool_config_path = self.config.data.tool_config_path
37
+ tool_list = initialize_tools_from_config(tool_config_path) if tool_config_path else []
38
+ self.tool_schemas = [tool.tool_schema.model_dump(exclude_unset=True, exclude_none=True) for tool in tool_list]
39
+
40
+ async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
41
+ messages = list(kwargs["raw_prompt"])
42
+
43
+ # 1. extract images and videos from messages
44
+ multi_modal_data = await self.process_vision_info(messages)
45
+ images = multi_modal_data.get("images")
46
+ videos = multi_modal_data.get("videos")
47
+
48
+ # 2. apply chat template and tokenize
49
+ prompt_ids = await self.apply_chat_template(
50
+ messages,
51
+ tools=self.tool_schemas,
52
+ images=images,
53
+ videos=videos,
54
+ )
55
+
56
+ # 3. generate sequences
57
+ metrics = {}
58
+ request_id = uuid4().hex
59
+ new_token_ids = kwargs.get("new_token_ids", [])
60
+ finished = kwargs.get("finished", False)
61
+ num_preempted = kwargs.get("num_preempted")
62
+ if finished:
63
+ with simple_timer("generate_sequences", metrics):
64
+ response_mask = [1] * len(new_token_ids)
65
+ if metrics.get("num_preempted") is None:
66
+ metrics["num_preempted"] = num_preempted if num_preempted is not None else -1
67
+ return AgentLoopOutput(
68
+ prompt_ids=prompt_ids,
69
+ response_ids=new_token_ids[: self.response_length],
70
+ response_mask=response_mask[: self.response_length],
71
+ response_logprobs=kwargs.get("log_probs"),
72
+ routed_experts=kwargs.get("routed_experts"),
73
+ multi_modal_data=multi_modal_data,
74
+ num_turns=2,
75
+ metrics=metrics,
76
+ )
77
+
78
+ origin_prompt_length = len(prompt_ids)
79
+ prompt_ids += new_token_ids
80
+
81
+ with simple_timer("generate_sequences", metrics):
82
+ output = await self.server_manager.generate(
83
+ request_id=request_id,
84
+ prompt_ids=prompt_ids,
85
+ sampling_params=sampling_params,
86
+ image_data=images,
87
+ video_data=videos,
88
+ global_id=kwargs.get("global_id"),
89
+ )
90
+
91
+ if metrics.get("num_preempted") is None:
92
+ metrics["num_preempted"] = output.num_preempted if output.num_preempted is not None else -1
93
+
94
+ all_token_ids = new_token_ids + output.token_ids
95
+ response_mask = [1] * len(all_token_ids)
96
+
97
+ output = AgentLoopOutput(
98
+ prompt_ids=prompt_ids[:origin_prompt_length],
99
+ response_ids=all_token_ids[: self.response_length],
100
+ response_mask=response_mask[: self.response_length],
101
+ response_logprobs=output.log_probs[: self.response_length] if output.log_probs else None,
102
+ routed_experts=(
103
+ output.routed_experts[: len(prompt_ids) + self.response_length]
104
+ if output.routed_experts is not None
105
+ else None
106
+ ),
107
+ multi_modal_data=multi_modal_data,
108
+ num_turns=2,
109
+ metrics=metrics,
110
+ )
111
+ return output
ICL/DAPO/verl-recipe/fault_recover/config/fault_recover_ppo_megatron_trainer.yaml ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ searchpath:
3
+ - file://verl/trainer/config
4
+
5
+ # specify the default per-component configs
6
+ defaults:
7
+ # <folder_name>@<field_name>.<field_name>: <yaml_file_name>
8
+ # actor_rollout_ref.actor: trainer/config/actor/megatron_actor.yaml
9
+ - actor@actor_rollout_ref.actor: megatron_actor
10
+ # data: trainer/config/data/legacy_data.yaml
11
+ - data@data: legacy_data
12
+ # (Rule-based) Reward manager config.
13
+ - reward_manager@reward_manager
14
+ # load the reference default config, then apply the fields in the current yaml
15
+ # Reference model config.
16
+ # Reference model will be enabled when actor.use_kl_loss or/and algorithm.use_kl_in_reward is/are True.
17
+ - ref@actor_rollout_ref.ref: megatron_ref
18
+ # Rollout model config.
19
+ - rollout@actor_rollout_ref.rollout: rollout
20
+ # Model config.
21
+ - model@actor_rollout_ref.model: hf_model
22
+ # Critic model config.
23
+ - critic@critic: megatron_critic
24
+ # Reward model config.
25
+ - reward_model@reward_model: megatron_reward_loop
26
+ # Rollout correction config.
27
+ - algorithm@algorithm.rollout_correction: rollout_correction
28
+ - _self_
29
+
30
+ actor_rollout_ref:
31
+ hybrid_engine: True
32
+
33
+ nccl_timeout: 600 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron
34
+
35
+ model:
36
+ override_config:
37
+ model_config: {}
38
+ moe_config:
39
+ freeze_moe_router: False
40
+
41
+ use_fused_kernels: False # Whether to use custom fused kernels (PostProcessing, for memory efficiency)
42
+
43
+ trust_remote_code: False
44
+
45
+ # Whether to remove padding tokens in inputs during training
46
+ use_remove_padding: false
47
+
48
+ # LoRA (Low-Rank Adaptation) configuration for parameter-efficient fine-tuning
49
+ lora:
50
+ # LoRA type: "lora", "vlm_lora", "canonical_lora", or "dora"
51
+ type: lora
52
+
53
+ # LoRA rank (Dimension of the low-rank projection space.). Set to 0 to disable LoRA
54
+ rank: 0 # typical values: 8, 16, 32, 64
55
+
56
+ # Weighting factor for the low-rank projection. Defaults to 32
57
+ alpha: 32
58
+
59
+ # Dropout rate for the low-rank projection. Defaults to 0.0
60
+ dropout: 0.0
61
+
62
+ # A list of module names to apply LoRA to.
63
+ # For fused LoRA, Defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2'].
64
+ # For canonical LoRA: ["linear_q", "linear_k", "linear_v", "linear_proj", "linear_fc1_up", "linear_fc1_gate", "linear_fc2"]
65
+ # - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections in self-attention
66
+ # - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention
67
+ # - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP
68
+ # - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP
69
+ # Target modules can also contain wildcards. For example, you can specify
70
+ # target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv on the first two layers
71
+ target_modules:
72
+ - linear_qkv
73
+ - linear_proj
74
+ - linear_fc1
75
+ - linear_fc2
76
+
77
+ # A list of module names not to apply LoRa to. It will match all nn.Linear & nn.Linear-adjacent modules whose name
78
+ # does not match any string in exclude_modules. If used, will require target_modules to be empty list or None
79
+ exclude_modules: []
80
+
81
+ # Position for applying dropout, can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'pre'
82
+ dropout_position: pre
83
+
84
+ # Initialization method for the low-rank matrix A. Defaults to "xavier".
85
+ lora_A_init_method: xavier
86
+
87
+ # Initialization method for the low-rank matrix B. Defaults to "zero".
88
+ lora_B_init_method: zero
89
+
90
+ # Enables the experimental All-to-All (A2A) communication strategy. Defaults to False
91
+ a2a_experimental: False
92
+
93
+ # Parameter data type for LoRA weights. Default to null, which will use model's dtype.
94
+ dtype: null
95
+
96
+ # Path to pre-trained LoRA adapter weights (null to train from scratch)
97
+ adapter_path: null
98
+
99
+ # VLMLoRA additionally allows the user to specify whether the language or vision models should be frozen.
100
+ # For example, a common finetuning workload for multimodal models is to apply adapters to language model and fully
101
+ # finetune the vision model.
102
+ freeze_vision_model: True
103
+ freeze_vision_projection: True
104
+ freeze_language_model: True
105
+
106
+ rollout:
107
+ quantization: null
108
+
109
+ layer_name_map:
110
+ qkv_layer_name: qkv
111
+ gate_proj_layer_name: gate_up
112
+
113
+ custom_reward_function:
114
+ path: null
115
+ name: compute_score
116
+
117
+ algorithm:
118
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
119
+ _target_: verl.trainer.config.AlgoConfig
120
+ gamma: 1.0
121
+ lam: 1.0
122
+ adv_estimator: gae
123
+ norm_adv_by_std_in_grpo: True
124
+ use_kl_in_reward: False
125
+ kl_penalty: kl # how to estimate kl divergence
126
+ kl_ctrl:
127
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
128
+ _target_: verl.trainer.config.KLControlConfig
129
+ type: fixed
130
+ kl_coef: 0.001
131
+ horizon: 10000
132
+ target_kl: 0.1
133
+ use_pf_ppo: False
134
+ pf_ppo:
135
+ reweight_method: pow # ["pow", "max_min", "max_random"]
136
+ weight_pow: 2.0
137
+
138
+ trainer:
139
+ balance_batch: True
140
+ total_epochs: 30
141
+ total_training_steps: null
142
+ project_name: verl_examples
143
+ experiment_name: gsm8k
144
+ logger: ["console", "wandb"]
145
+ log_val_generations: 0
146
+ nnodes: 1
147
+ n_gpus_per_node: 8
148
+ save_freq: -1
149
+ esi_redundant_time: 0
150
+
151
+ # auto: find the last ckpt to resume. If can't find, start from scratch
152
+ resume_mode: auto # or disable or resume_path if resume_from_path is set
153
+ resume_from_path: null
154
+ del_local_ckpt_after_load: False
155
+ val_before_train: True
156
+ test_freq: -1
157
+ critic_warmup: 0
158
+ default_hdfs_dir: null
159
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
160
+ max_actor_ckpt_to_keep: null
161
+ max_critic_ckpt_to_keep: null
162
+ # The timeout for ray worker group to wait for the register center to be ready
163
+ ray_wait_register_center_timeout: 300
164
+ device: cuda
165
+ # Directory for logging rollout data; no dump if null
166
+ rollout_data_dir: null
167
+
168
+ # whether to use legacy worker implementation
169
+ # mode: "auto", "enable", or "disable"
170
+ use_legacy_worker_impl: auto
171
+
172
+ global_profiler:
173
+ _target_: verl.utils.profiler.ProfilerConfig
174
+ tool: null # choose between nsys, npu, torch, torch_memory
175
+ steps: null # profile steps
176
+ profile_continuous_steps: False
177
+ save_path: "outputs/profile" # profiler saving path
178
+ # Specific tool configs, can use +profiler.tool_config.[tool].xxx to config
179
+ global_tool_config:
180
+ # nsys config
181
+ nsys:
182
+ # True for each task has its own database, False for all tasks in one training step share one database.
183
+ discrete: False
184
+
185
+ # controller Nvidia Nsight Systems Options. Must set when profile_steps is not None.
186
+ ## reference https://docs.nvidia.com/nsight-systems/UserGuide/index.html
187
+ ## reference https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html
188
+ controller_nsight_options:
189
+ # Select the API(s) to be traced.
190
+ trace: "cuda,nvtx,cublas,ucx"
191
+
192
+ # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
193
+ cuda-memory-usage: "true"
194
+
195
+ # CUDA graphs will be traced as a whole
196
+ cuda-graph-trace: "graph"
197
+
198
+ # worker Nvidia Nsight Systems Options. Must set when profile_steps is not None.
199
+ worker_nsight_options:
200
+ # Select the API(s) to be traced.
201
+ trace: "cuda,nvtx,cublas,ucx"
202
+
203
+ # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
204
+ cuda-memory-usage: "true"
205
+
206
+ # CUDA graphs will be traced as a whole
207
+ cuda-graph-trace: "graph"
208
+
209
+ # Profiling only in a range of torch.cuda.profiler.start and stop. Do not change this config.
210
+ capture-range: "cudaProfilerApi"
211
+
212
+ # Specify the desired behavior when a capture range ends.
213
+ # In verl we need the torch.cuda.profiler.start/stop pair to repeats n times.
214
+ # valid values are "repeat-shutdown:n" or null.
215
+ # For normal whole step profiling, n = len(profile_steps);
216
+ # but for discrete profiling, n = len(profile_steps) * Number(subtasks).
217
+ # Or you can just leave it null and the program will use n = len(profile_steps) * 6;
218
+ capture-range-end: null
219
+
220
+ # Send signal to the target application's process group. We let the program to exit by itself.
221
+ kill: none
222
+
223
+ # enable memory visualization for debugging memory usage
224
+ torch_memory:
225
+ # Maximum number of allocation entries to record
226
+ trace_alloc_max_entries: 100_000
227
+ # The depth of the call stack to capture for each allocation
228
+ stack_depth: 32
229
+ # 'alloc': records only allocation events || 'state': records memory state changes || 'all': records both.
230
+ context: "all"
231
+ # 'python': records Python stacks || 'cpp': records C++ stacks (available in some versions) || 'all': records both.
232
+ stacks: "all"
233
+ # devices, record_context etc.
234
+ kw_args: {}
235
+
236
+ # configs for TransferQueue
237
+ transfer_queue:
238
+ # Whether to enable transfer queue
239
+ enable: False
240
+
241
+ ray_kwargs:
242
+ ray_init:
243
+ num_cpus: null # `None` means using all CPUs, which might cause hang if limited in systems like SLURM. Please set to a number allowed then.
244
+ timeline_json_file: null
245
+
246
+ fault_manager:
247
+ enable: False
248
+ # max retry times for other training phases except rollout (restart ray)
249
+ max_reschedule_times: 1
250
+ # max retry times for rollout phase (rebuild worker group)
251
+ max_rebuild_times: 1
252
+ # timeout of waiting cluster to be ready for reschedule
253
+ timeout_reschedule: 300
254
+ # timeout of waiting cluster to be ready for rebuild
255
+ timeout_rebuild: 300
256
+ # check chips usage interval during rollout, set -1 to disable timeout check
257
+ timeout_task_check_interval: 10
258
+ # timeout of chips usage being free, set -1 to disable chip check and
259
+ # 'timeout_task_check_interval' will be the whole time limit of rollout
260
+ # which means you should increase it
261
+ timeout_chip_free: 30
262
+ # file path for token saving
263
+ tokens_save_file: ./tokens_ckpt/tokens.pt
264
+ # interval of saving tokens to disk
265
+ tokens_save_interval: 10
ICL/DAPO/verl-recipe/fault_recover/vllm_rollout/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
ICL/DAPO/verl-recipe/fault_recover/vllm_rollout/vllm_async_server.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import argparse
15
+ import inspect
16
+ import logging
17
+ from typing import Any
18
+
19
+ import ray
20
+ import vllm
21
+ from packaging import version
22
+ from vllm.engine.arg_utils import AsyncEngineArgs
23
+ from vllm.entrypoints.openai.api_server import (
24
+ build_app,
25
+ init_app_state,
26
+ )
27
+ from vllm.usage.usage_lib import UsageContext
28
+
29
+ from verl.workers.config import HFModelConfig, RolloutConfig
30
+ from verl.workers.rollout.utils import run_unvicorn
31
+ from verl.workers.rollout.vllm_rollout.vllm_async_server import vLLMHttpServer, vLLMReplica
32
+
33
+ _VLLM_VERSION = version.parse(vllm.__version__)
34
+
35
+ logger = logging.getLogger(__file__)
36
+ logger.setLevel(logging.INFO)
37
+
38
+
39
+ class FaultRecovervLLMHttpServer(vLLMHttpServer):
40
+ """vLLM http server in single node, this is equivalent to launch server with command line:
41
+ ```
42
+ vllm serve --tensor-parallel-size=8 ...
43
+ ```
44
+ """
45
+
46
+ async def run_server(self, args: argparse.Namespace):
47
+ from recipe.fault_recover.async_llm import AsyncFaultRecoverLLM as AsyncLLM
48
+
49
+ engine_args = AsyncEngineArgs.from_cli_args(args)
50
+ usage_context = UsageContext.OPENAI_API_SERVER
51
+ vllm_config = engine_args.create_engine_config(usage_context=usage_context)
52
+ vllm_config.parallel_config.data_parallel_master_port = self._dp_master_port
53
+
54
+ fn_args = set(dict(inspect.signature(AsyncLLM.from_vllm_config).parameters).keys())
55
+ kwargs = {}
56
+ if "enable_log_requests" in fn_args:
57
+ kwargs["enable_log_requests"] = engine_args.enable_log_requests
58
+ if "disable_log_stats" in fn_args:
59
+ kwargs["disable_log_stats"] = engine_args.disable_log_stats
60
+
61
+ engine_client = AsyncLLM.from_vllm_config(vllm_config=vllm_config, usage_context=usage_context, **kwargs)
62
+
63
+ # Don't keep the dummy data in memory
64
+ await engine_client.reset_mm_cache()
65
+ await engine_client.collective_rpc(
66
+ method="monkey_patch_model", kwargs={"vocab_size": len(self.model_config.tokenizer)}
67
+ )
68
+
69
+ build_app_sig = inspect.signature(build_app)
70
+ supported_tasks: tuple[Any, ...] = ()
71
+ if "supported_tasks" in build_app_sig.parameters:
72
+ supported_tasks = await engine_client.get_supported_tasks()
73
+ app = build_app(args, supported_tasks)
74
+ else:
75
+ app = build_app(args)
76
+
77
+ init_app_sig = inspect.signature(init_app_state)
78
+ if "vllm_config" in init_app_sig.parameters:
79
+ await init_app_state(engine_client, vllm_config, app.state, args)
80
+ elif "supported_tasks" in init_app_sig.parameters:
81
+ await init_app_state(engine_client, app.state, args, supported_tasks)
82
+ else:
83
+ await init_app_state(engine_client, app.state, args)
84
+ if self.replica_rank == 0 and self.node_rank == 0:
85
+ logger.info(f"Initializing a V1 LLM engine with config: {vllm_config}")
86
+
87
+ self.engine = engine_client
88
+ self._server_port, self._server_task = await run_unvicorn(app, args, self._server_address)
89
+
90
+ def clear_engine(self):
91
+ self.engine.shutdown()
92
+
93
+
94
+ class FaultRecovervLLMReplica(vLLMReplica):
95
+ def __init__(
96
+ self,
97
+ replica_rank: int,
98
+ config: RolloutConfig,
99
+ model_config: HFModelConfig,
100
+ gpus_per_node: int = 8,
101
+ is_reward_model: bool = False,
102
+ ):
103
+ super().__init__(replica_rank, config, model_config, gpus_per_node, is_reward_model)
104
+ self.server_class = ray.remote(FaultRecovervLLMHttpServer)
ICL/DAPO/verl-recipe/flowrl/config/flowrl_trainer.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ searchpath:
3
+ - file://verl/trainer/config
4
+
5
+ defaults:
6
+ - ppo_trainer
7
+ - _self_
8
+
9
+ data:
10
+ gen_batch_size: ${data.train_batch_size}
11
+
12
+ reward_model:
13
+ reward_manager: dapo
14
+ overlong_buffer:
15
+ enable: False # We try to avoid forgetting to set enable
16
+ len: 0
17
+ penalty_factor: 0.0
18
+ log: False
19
+
20
+ algorithm:
21
+ # _target_: verl.trainer.config.AlgoConfig
22
+
23
+ # # FlowRL trajectory balance coefficient (β)
24
+ # tb_coef: 15.0
25
+
26
+ filter_groups:
27
+ _target_: verl.trainer.config.FilterGroupsConfig
28
+ enable: False # We try to avoid forgetting to set enable
29
+ metric: null # acc / score / seq_reward / seq_final_reward / ...
30
+ max_num_gen_batches: 0 # Non-positive values mean no upper limit
31
+
32
+ trainer:
33
+ project_name: verl-flowrl
ICL/DAPO/verl-recipe/flowrl/figures/file.svg ADDED
ICL/DAPO/verl-recipe/flowrl/figures/flowrl.pdf ADDED
Binary file (52.5 kB). View file
 
ICL/DAPO/verl-recipe/flowrl/prepare/prepare_data.sh ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -uxo pipefail
3
+
4
+ export DOWNLOAD_DIR=${DOWNLOAD_DIR:-"downloads"}
5
+ export DATA_DIR=${DATA_DIR:-"${DOWNLOAD_DIR}/data"}
6
+
7
+ # Create final data directory
8
+ mkdir -p "${DATA_DIR}"
9
+
10
+ # Download DAPO-Math-17k dataset
11
+ DATASET_NAME_TRAIN="BytedTsinghua-SIA/DAPO-Math-17k"
12
+ echo "Downloading ${DATASET_NAME_TRAIN}..."
13
+ huggingface-cli download $DATASET_NAME_TRAIN \
14
+ --repo-type dataset \
15
+ --resume-download \
16
+ --local-dir ${DOWNLOAD_DIR}/${DATASET_NAME_TRAIN} \
17
+ --local-dir-use-symlinks False
18
+
19
+ # Move the parquet file to data directory
20
+ if [ -f "${DOWNLOAD_DIR}/${DATASET_NAME_TRAIN}/data/dapo-math-17k.parquet" ]; then
21
+ mv "${DOWNLOAD_DIR}/${DATASET_NAME_TRAIN}/data/dapo-math-17k.parquet" "${DATA_DIR}/dapo-math-17k.parquet"
22
+ echo "✓ Moved dapo-math-17k.parquet to ${DATA_DIR}/"
23
+ fi
24
+
25
+ # Download AIME-2024 dataset
26
+ DATASET_NAME_TEST="BytedTsinghua-SIA/AIME-2024"
27
+ echo "Downloading ${DATASET_NAME_TEST}..."
28
+ huggingface-cli download $DATASET_NAME_TEST \
29
+ --repo-type dataset \
30
+ --resume-download \
31
+ --local-dir ${DOWNLOAD_DIR}/${DATASET_NAME_TEST} \
32
+ --local-dir-use-symlinks False
33
+
34
+ # Move the parquet file to data directory
35
+ if [ -f "${DOWNLOAD_DIR}/${DATASET_NAME_TEST}/data/aime-2024.parquet" ]; then
36
+ mv "${DOWNLOAD_DIR}/${DATASET_NAME_TEST}/data/aime-2024.parquet" "${DATA_DIR}/aime-2024.parquet"
37
+ echo "✓ Moved aime-2024.parquet to ${DATA_DIR}/"
38
+ fi
39
+
40
+ echo ""
41
+ echo "Data preparation completed!"
42
+ echo "Training file: ${DATA_DIR}/dapo-math-17k.parquet"
43
+ echo "Test file: ${DATA_DIR}/aime-2024.parquet"
ICL/DAPO/verl-recipe/flowrl/prepare/prepare_model.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Download the base model from the Hugging Face Hub into downloads/models/,
# skipping legacy *.pth checkpoints (safetensors are sufficient for verl).
set -euo pipefail

# Allow overriding the model via the environment; default preserved.
MODEL_NAME=${MODEL_NAME:-Qwen/Qwen2.5-7B}

# Note: the exclude pattern must be quoted — unquoted `*.pth` would be
# glob-expanded by the shell if a matching file exists in the CWD.
huggingface-cli download "$MODEL_NAME" \
    --repo-type model \
    --resume-download \
    --local-dir "downloads/models/$MODEL_NAME" \
    --local-dir-use-symlinks False \
    --exclude "*.pth"
ICL/DAPO/verl-recipe/gkd/megatron/megatron_utils.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3
+ # Copyright 2023-2024 SGLang Team
4
+ # Copyright 2025 ModelBest Inc. and/or its affiliates
5
+ # Copyright 2025 Individual Contributor: Brilliant Hanabi, furunding
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ import torch
19
+ from megatron.core import parallel_state as mpu
20
+
21
+ import verl.utils.megatron.tensor_parallel as tp_utils
22
+ from verl.utils.device import get_device_id
23
+ from verl.utils.megatron_utils import default_tp_concat_fn, unwrap_model
24
+ from verl.utils.model import normalize_model_name
25
+
26
+
27
def per_tensor_generator(
    actor_module,
    model_config,
    weight_converter,
    transformer_config,
    layer_name_mapping,
    convert_qkv_gate_up_by_simple_split=True,
):
    """Lazily yield ``(hf_name, full_tensor)`` pairs for the whole model.

    Walks every parameter of a pipeline/tensor/expert-parallel Megatron model,
    gathers shards across the PP (point-to-point to pp_rank 0), EP, ETP and TP
    groups, converts each merged tensor to HuggingFace layout via
    ``weight_converter.convert_param``, and yields the converted pairs.

    Must be called collectively on all ranks of the model-parallel groups: every
    rank iterates the same global parameter list so the collectives line up.

    Args:
        actor_module: list of (possibly wrapped) Megatron model chunks, one per
            virtual-pipeline stage.
        model_config: model config; only ``tie_word_embeddings`` is read here.
        weight_converter: mcore->HF converter exposing ``convert_param``,
            ``mcore_config`` and ``hf_config``.
        transformer_config: Megatron transformer config used by
            ``normalize_model_name`` to globalize per-stage parameter names.
        layer_name_mapping: forwarded to ``default_tp_concat_fn``.
        convert_qkv_gate_up_by_simple_split: forwarded to the concat fn.
    """
    # Parallel topology of the calling rank.
    tp_rank = mpu.get_tensor_model_parallel_rank()
    pp_rank = mpu.get_pipeline_model_parallel_rank()
    ep_rank = mpu.get_expert_model_parallel_rank()
    etp_rank = mpu.get_expert_tensor_parallel_rank()
    ep_size = mpu.get_expert_model_parallel_world_size()
    etp_size = mpu.get_expert_tensor_parallel_world_size()
    ep_group = mpu.get_expert_model_parallel_group()
    etp_group = mpu.get_expert_tensor_parallel_group()
    vpp_size = len(actor_module)  # number of virtual-pipeline model chunks
    tp_group = mpu.get_tensor_model_parallel_group()
    tp_size = torch.distributed.get_world_size(group=tp_group)

    def tensor_generator():
        # Yield this rank's local parameters in deterministic order — the same
        # order used to build meta_info below, keeping both walks aligned.
        for scan_vpp_idx in range(vpp_size):
            existing_keys = set()
            model = unwrap_model(actor_module[scan_vpp_idx])
            for name, param in model.named_parameters():
                existing_keys.add(name)
                yield name, param
            # note
            # there is a bug in megatron GPTModel
            # decoder.layers[n].mlp.router.expert_bias" in GPTModel is not registered in named_parameter, but in
            # state_dict(). for now we patch it by adding those keys to extra_keys.
            extra_keys = [x for x in model.state_dict().keys() if "_extra_state" not in x and x not in existing_keys]
            for name in extra_keys:
                yield name, model.state_dict()[name].to(get_device_id())

    def get_tensor_spec(tensor):
        # Minimal metadata needed to allocate a matching recv buffer elsewhere.
        shape = tensor.shape
        dtype = tensor.dtype
        tensor_parallel = getattr(tensor, "tensor_model_parallel", None)
        partition_dim = getattr(tensor, "partition_dim", None)
        tensor_spec = (shape, dtype, tensor_parallel, partition_dim)
        return tensor_spec

    def make_tensor(tensor_spec):
        # Allocate an uninitialized buffer matching a get_tensor_spec() tuple,
        # restoring the Megatron TP attributes used by is_tensor_parallel_param.
        tensor = torch.empty(size=tensor_spec[0], dtype=tensor_spec[1], device=get_device_id())
        if tensor_spec[2] is not None:
            tensor.tensor_model_parallel = tensor_spec[2]
        if tensor_spec[3] is not None:
            tensor.partition_dim = tensor_spec[3]
        return tensor

    # we need first make all rank get full model information
    meta_info = []
    for scan_vpp_idx in range(vpp_size):
        existing_keys = set()
        model = unwrap_model(actor_module[scan_vpp_idx])
        for idx, (name, param) in enumerate(model.named_parameters()):
            existing_keys.add(name)
            meta_info.append((pp_rank, scan_vpp_idx, idx, name, get_tensor_spec(param)))
        # Same extra-key patch as in tensor_generator (see note there).
        # NOTE(review): `idx` below is the final value from the enumerate loop,
        # so all extra keys share it — confirm nothing relies on idx uniqueness.
        extra_keys = [
            (x, y) for x, y in model.state_dict().items() if "_extra_state" not in x and x not in existing_keys
        ]
        for name, param in extra_keys:
            meta_info.append((pp_rank, scan_vpp_idx, idx, name, get_tensor_spec(param)))

    # Exchange per-stage metadata so every PP rank knows the full parameter list.
    obj_spec_output = [None] * mpu.get_pipeline_model_parallel_world_size()
    torch.distributed.all_gather_object(
        object_list=obj_spec_output, obj=meta_info, group=mpu.get_pipeline_model_parallel_group()
    )
    layer_list_meta = [item for sublist in obj_spec_output for item in sublist]

    gen_func = tensor_generator()

    # lazy load tensor for full model
    for cur_pp_rank, scan_vpp_idx, idx, name, tensor_spec in layer_list_meta:
        # fp.write(f"DEBUG: ({cur_pp_rank}, {scan_vpp_idx}, {name})\n")
        if model_config.tie_word_embeddings and ("output_layers" in name):
            import warnings

            warnings.warn(
                "Current model sharing word and embedding weights, skip output layer conversion", stacklevel=2
            )
            continue

        # Map the stage-local Megatron name to its global (all-stages) name.
        cur_name = normalize_model_name(name, cur_pp_rank, scan_vpp_idx, transformer_config)

        if cur_pp_rank == pp_rank:
            # Parameter lives on this PP rank; pull it from the local walk.
            _, cur_tensor = next(gen_func)

        else:
            cur_tensor = None

        # PP transfer: rank 0 receives every remote parameter; owning ranks send.
        if pp_rank == 0:
            if cur_tensor is None:
                cur_tensor = make_tensor(tensor_spec)
                torch.distributed.recv(cur_tensor, group=mpu.get_pipeline_model_parallel_group(), group_src=cur_pp_rank)
        else:
            if cur_tensor is None:
                # Non-owning, non-zero rank: allocate a placeholder so the
                # per-parameter flow below stays uniform across ranks.
                cur_tensor = make_tensor(tensor_spec)
            else:
                torch.distributed.send(cur_tensor, group=mpu.get_pipeline_model_parallel_group(), group_dst=0)

        # (xya): this is a hack to fix the name of the parameters
        while cur_name.startswith("module."):
            cur_name = cur_name[len("module.") :]

        def gather(tensor, gather_list, group, group_dst, group_rank):
            # torch.distributed.gather requires gather_list only on the
            # destination rank; all other ranks must pass None.
            if group_rank == group_dst:
                torch.distributed.gather(tensor, gather_list, group=group, group_dst=group_dst)
            else:
                torch.distributed.gather(tensor, None, group=group, group_dst=group_dst)

        # EP
        if ".mlp.experts.linear_fc" in cur_name and ep_size > 1:
            # Expert-parallel weight: gather the per-rank expert shards and
            # renumber local expert ids into global ids.
            num_experts = weight_converter.mcore_config.num_moe_experts
            num_experts_per_rank = num_experts // ep_size
            infer_params = [torch.empty_like(cur_tensor) for _ in range(ep_size)]
            gather(cur_tensor, infer_params, group=ep_group, group_dst=0, group_rank=ep_rank)

            # Names look like "<prefix>.weight<local_id>"; rebuild one global
            # name per EP rank from the shared local id.
            name_prefix, local_expert_id = cur_name.split(".weight")
            local_expert_id = int(local_expert_id)
            global_expert_ids = [num_experts_per_rank * _ep_rank + local_expert_id for _ep_rank in range(ep_size)]
            global_expert_names = [f"{name_prefix}.weight{expert_id}" for expert_id in global_expert_ids]

            for name, param in zip(global_expert_names, infer_params, strict=True):
                if etp_size > 1:
                    # gather etp
                    etp_params = [torch.empty_like(param) for _ in range(etp_size)]
                    gather(param, etp_params, group=etp_group, group_dst=0, group_rank=etp_rank)
                    params = etp_params
                else:
                    params = [param]

                merge_params = default_tp_concat_fn(
                    layer_name_mapping,
                    name,
                    cur_tensor,
                    params,
                    model_config,
                    weight_converter.hf_config,
                    convert_qkv_gate_up_by_simple_split,
                )
                if not isinstance(merge_params, list):
                    merge_params = [merge_params]
                converted_names, converted_params = weight_converter.convert_param(name, merge_params)

                yield from zip(converted_names, [param.detach() for param in converted_params], strict=True)

            continue
        # tp all gather
        if tp_utils.is_tensor_parallel_param(cur_tensor):
            # allocate a new tensor with proper size
            if tp_size <= 1:
                infer_params = [cur_tensor]
            else:
                infer_params = [torch.empty_like(cur_tensor) for _ in range(tp_size)]
                gather(cur_tensor, infer_params, tp_group, group_dst=0, group_rank=tp_rank)
            infer_params = default_tp_concat_fn(
                layer_name_mapping,
                cur_name,
                cur_tensor,
                infer_params,
                model_config,
                weight_converter.hf_config,
                convert_qkv_gate_up_by_simple_split,
            )
        else:
            infer_params = cur_tensor

        if not isinstance(infer_params, list):
            infer_params = [infer_params]
        converted_names, converted_params = weight_converter.convert_param(cur_name, infer_params)

        yield from zip(converted_names, [param.detach() for param in converted_params], strict=True)
ICL/DAPO/verl-recipe/gvpo/config/gvpo_trainer.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Hydra config for the GVPO recipe: inherits verl's base ppo_trainer config
# and swaps in the GVPO-specific actor configuration.
hydra:
  searchpath:
    # Lets the `ppo_trainer` default below resolve against verl's built-in configs.
    - file://verl/trainer/config

defaults:
  - ppo_trainer  # base PPO trainer config from verl
  - _self_       # values in this file override the base

actor_rollout_ref:
  actor:
    # Use the GVPO actor config dataclass instead of the default actor config.
    _target_: recipe.gvpo.gvpo_actor_config.FSDPActorConfig
    # GVPO beta coefficient — presumably the regularization strength; see the
    # recipe.gvpo actor implementation for its exact semantics.
    gvpo_beta: 0.1

trainer:
  project_name: gvpo
ICL/DAPO/verl-recipe/langgraph_agent/example/README.md ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MathExpression: LangGraph Agent Example
2
+
3
+ MathExpression is a tiny example to demonstrate multi-turn rollout with [LangGraph ReactAgent](https://langchain-ai.github.io/langgraph/agents/overview/).
4
+
5
+ ### Define react agent with tool
6
+ Firstly, to force ReactAgent to evaluate math expression by tool, we define a special operand `@`:
7
+ ```python
8
+ @tool(parse_docstring=True)
9
+ def calculate(a: int, b: int, operand: str) -> int:
10
+ """
11
+ Compute the results using operand with two integers
12
+
13
+ Args:
14
+ a: the first operand
15
+ b: the second operand
16
+ operand: '+' or '-' or '*' or '@'
17
+ """
18
+ assert operand in ["+", "-", "*", "@"], f"unknown operand {operand}"
19
+ if operand == "@":
20
+ return 3 * a - 2 * b
21
+ return eval(f"{a} {operand} {b}")
22
+ ```
23
+
24
+ Without calling `calculate`, it is impossible for ReactAgent to evaluate math expressions correctly.
25
+
26
+ Then, we can equip ReactAgent with `calculate` tool:
27
+ ```python
28
+ class MathExpressionReactAgentLoop(ReactAgentLoop):
29
+ @classmethod
30
+ def init_class(cls, config, tokenizer):
31
+ cls.tools = [calculate]
32
+ super().init_class(config, tokenizer)
33
+ ```
34
+
35
+ We can define the agent loop config in a yaml file, which will be used by AgentLoopWorker to dynamically load the custom AgentLoop class.
36
+ ```yaml
37
+ - name: math_expression
38
+ _target_: recipe.langgraph_agent.example.math_expression.MathExpressionReactAgentLoop
39
+ ```
40
+
41
+ ### Prepare dataset
42
+ Now, let's prepare two small datasets for training and evaluation:
43
+ ```bash
44
+ python recipe/langgraph_agent/example/create_dataset.py
45
+ ```
46
+
47
+ - Parameters: `--train_size` (default: 5000), `--test_size` (default: 500), `--output_dir` (default: `data/math_expression_tool`).
48
+ - Example with custom sizes/output:
49
+ ```bash
50
+ python recipe/langgraph_agent/example/create_dataset.py \
51
+ --train_size 10000 \
52
+ --test_size 1000 \
53
+ --output_dir data/math_expression_tool
54
+ ```
55
+
56
+ Note that dataset should contain a column `agent_name` with `math_expression`, which is used by `AgentLoopWorker` to select the
57
+ agent loop class.
58
+ | prompt | reward_model | agent_name |
59
+ |--------------------------------------|------------------------------|-----------------|
60
+ | [{'role': 'user', 'content': '...'}] | {'ground_truth': '-10', ...} | math_expression |
61
+ | [{'role': 'user', 'content': '...'}] | {'ground_truth': '-10', ...} | math_expression |
62
+
63
+ Generated math expressions are like below, requiring model to call `calculate` multiple times to solve sub expressions.
64
+ ```
65
+ (2 @ (8 @ 8 @ 5 @ 5 @ 3) @ 6 @ (1 @ 4 @ 4 @ 4) @ 2) @ 6
66
+ (4.6 @ (9.05 @ 4.0) @ 8.3 @ 1.21) @ 8.6
67
+ 9 @ 4
68
+ ((2 @ 2) @ (3 @ 3)) @ 4
69
+ ```
70
+
71
+ ### Training
72
+ Hook all these up and start training:
73
+ ```bash
74
+ bash recipe/langgraph_agent/example/run_qwen2.5_3b.sh 2>&1 | tee train.log
75
+ ```
76
+
77
+ To submit on a SLURM cluster (the script contains SBATCH headers):
78
+ ```bash
79
+ sbatch recipe/langgraph_agent/example/run_qwen2.5_3b.sh
80
+ ```
81
+
82
+ **Note on `GPUS_PER_NODE` and `NNODES`:**
83
+
84
+ - `GPUS_PER_NODE`: GPUs per node.
85
+ Detection order: `SLURM_GPUS_ON_NODE` (if set) → `GPUS_PER_NODE` → `2`.
86
+ - `NNODES`: number of nodes.
87
+ Detection order: `SLURM_JOB_NUM_NODES` (if set) → `NNODES` → `1`.
88
+ - Total GPUs = `GPUS_PER_NODE × NNODES` (must be ≥ 2).
89
+
90
+ Local override (no `SLURM_*` set):
91
+ ```bash
92
+ GPUS_PER_NODE=4 NNODES=2 bash recipe/langgraph_agent/example/run_qwen2.5_3b.sh
93
+ ```
94
+
95
+ After a total of 39 steps, the model should achieve 100% accuracy on the test dataset:
96
+ - val-aux/lighteval/MATH/reward: 1.0
97
+ - val-aux/num_turns/mean: 9.0, average number of messages include assistant and tool turns.
98
+
99
+ We can see model can solve math expression by calling `calculate` tool:
100
+ ```
101
+ system
102
+ You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
103
+ user
104
+ We define a new math operator @, where you can only call an external tool to compute. Please put your final answer inside \boxed{} only in the last turn. Now answer the following questions:
105
+ Compute 5 @ (3 @ 6 @ 8)
106
+ assistant
107
+ Sure, let's break this down step by step. We need to compute the expression \(5 @ (3 @ 6 @ 8)\). First, we need to compute the inner expression \(3 @ 6 @ 8\).
108
+
109
+ Let's compute \(3 @ 6\) first using the external tool.
110
+ <tool_call>
111
+ {"name": "calculate", "arguments": {"a": 3, "b": 6, "operand": "@"}}
112
+ </tool_call>user
113
+ <tool_response>
114
+ -3
115
+ </tool_response>
116
+ assistant
117
+ We have computed that \(3 @ 6 = -3\). Now we need to compute \(-3 @ 8\).
118
+
119
+ Let's use the external tool to compute this.
120
+ <tool_call>
121
+ {"name": "calculate", "arguments": {"a": -3, "b": 8, "operand": "@"}}
122
+ </tool_call>user
123
+ <tool_response>
124
+ -25
125
+ </tool_response>
126
+ assistant
127
+ We have computed that \(-3 @ 8 = -25\). Now we can put this result into the original expression to get the final result of \(5 @ (3 @ 6 @ 8)\), which is \(5 @ (-25)\).
128
+
129
+ Let's compute \(5 @ (-25)\) using the external tool.
130
+ <tool_call>
131
+ {"name": "calculate", "arguments": {"a": 5, "b": -25, "operand": "@"}}
132
+ </tool_call>user
133
+ <tool_response>
134
+ 65
135
+ </tool_response>
136
+ assistant
137
+ The final result of \(5 @ (3 @ 6 @ 8)\) is \(\boxed{65}\).
138
+ ```
ICL/DAPO/verl-recipe/langgraph_agent/example/agent.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# Agent-loop registry consumed by AgentLoopWorker: maps a dataset row's
# `agent_name` value to the AgentLoop class to instantiate for that sample.
- name: math_expression
  _target_: recipe.langgraph_agent.example.math_expression.MathExpressionReactAgentLoop
ICL/DAPO/verl-recipe/langgraph_agent/example/create_dataset.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Create dataset for calculator
16
+ """
17
+
18
+ import argparse
19
+ import os
20
+ import random
21
+
22
+ import pandas as pd
23
+
24
+
25
def generate_math_expression(min_terms=2, max_terms=5, min_number=1, max_number=10, allow_decimals=False, max_depth=2):
    """
    Generate a random mathematical expression with operators and parentheses.

    Note: with the operator weights used below, only the custom '@' operator is
    ever emitted; '+', '-', '*' and '/' all have weight 0.

    Args:
        min_terms (int): Minimum number of terms in the expression.
        max_terms (int): Maximum number of terms in the expression.
        min_number (int): Minimum value for numbers (must be < max_number).
        max_number (int): Maximum value for numbers in the expression.
        allow_decimals (bool): Whether to allow decimal numbers.
        max_depth (int): Maximum nesting depth for parentheses.

    Returns:
        str: A valid mathematical expression as a string.
    """

    def generate_number():
        """Generate a random number (integer or float) drawn from [min_number, max_number]."""
        assert min_number < max_number
        num = random.uniform(min_number, max_number)
        if not allow_decimals:
            num = int(num)
        else:
            num = round(num, random.randint(0, 2))  # Round to 0-2 decimal places
        return str(num)

    def generate_term(depth=0):
        """Generate a term (number or parenthesized expression)."""
        if depth < max_depth and random.random() < 0.5:  # 50% chance to add parentheses
            expr = generate_expression(depth + 1)
            return f"({expr})"
        else:
            return generate_number()

    def generate_expression(depth=0):
        """Generate a full expression with multiple terms and operators."""
        num_terms = random.randint(min_terms, max_terms)
        terms = [generate_term(depth) for _ in range(num_terms)]

        # Randomly select operators
        operators = ["+", "-", "*", "/", "@"]
        expr = terms[0]

        for i in range(1, num_terms):
            # Only '@' has non-zero weight, so every operator emitted is the
            # custom '@' — this forces the agent to use the external tool.
            op = random.choices(
                operators,
                weights=[0, 0, 0, 0, 1],
            )[0]
            expr += f" {op} " + terms[i]

        return expr

    return generate_expression()
78
+
79
+
80
def test():
    """Print sample expressions covering the generator's main option combinations."""
    sample_configs = [
        {},  # basic integer expression, e.g. (3 + 7) * 2 - 5
        {"allow_decimals": True},  # decimals, e.g. 4.5 / (2.1 + 3.7) - 1.2
        {"max_terms": 6, "max_depth": 3},  # more terms, deeper nesting
        {"min_terms": 2, "max_terms": 3, "max_number": 5},  # simplified
    ]
    for kwargs in sample_configs:
        print(generate_math_expression(**kwargs))
96
+
97
+
98
def calculate(expression: str) -> float:
    """
    Evaluate a mathematical expression with +, -, *, /, @, and parentheses.

    The custom @ operator is defined as: a @ b = 3a - 2b. Operators of equal
    precedence associate left-to-right; @ binds tighter than * and /.

    Args:
        expression (str): Input mathematical expression (e.g., "3@2+4").

    Returns:
        float: Result of the evaluated expression (returned as ``int`` when
            the value is integral).

    Raises:
        ValueError: For invalid expressions (e.g., mismatched parentheses,
            division by zero, unknown characters).
    """
    # @ binds tighter than * and /, which bind tighter than + and -.
    _precedence = {"@": 3, "*": 2, "/": 2, "+": 1, "-": 1}

    def _scan(text: str) -> list:
        """Split the input into number / operator / parenthesis tokens."""
        pieces = []
        pos = 0
        length = len(text)
        while pos < length:
            ch = text[pos]
            if ch.isdigit() or ch == ".":
                # Consume a full integer or decimal literal.
                end = pos
                while end < length and (text[end].isdigit() or text[end] == "."):
                    end += 1
                pieces.append(text[pos:end])
                pos = end
            elif ch in "+-*/@()":
                pieces.append(ch)
                pos += 1
            elif ch.isspace():
                pos += 1
            else:
                raise ValueError(f"Invalid character: {ch}")
        return pieces

    def _to_rpn(tokens: list) -> list:
        """Shunting-yard: convert infix tokens to postfix (RPN) order."""
        rpn = []
        op_stack = []
        for tok in tokens:
            if tok.isdigit() or "." in tok:
                rpn.append(tok)
            elif tok == "(":
                op_stack.append(tok)
            elif tok == ")":
                # Unwind back to the matching opening parenthesis.
                while op_stack and op_stack[-1] != "(":
                    rpn.append(op_stack.pop())
                if not op_stack or op_stack[-1] != "(":
                    raise ValueError("Mismatched parentheses")
                op_stack.pop()  # discard '('
            else:
                # Pop operators of greater-or-equal precedence (left-assoc).
                while (
                    op_stack
                    and op_stack[-1] != "("
                    and _precedence.get(op_stack[-1], 0) >= _precedence.get(tok, 0)
                ):
                    rpn.append(op_stack.pop())
                op_stack.append(tok)

        while op_stack:
            if op_stack[-1] in "()":
                raise ValueError("Mismatched parentheses")
            rpn.append(op_stack.pop())

        return rpn

    def _run_rpn(rpn: list) -> float:
        """Evaluate a postfix token list with an operand stack."""
        values = []
        for tok in rpn:
            if tok.isdigit() or "." in tok:
                values.append(float(tok))
                continue
            if len(values) < 2:
                raise ValueError("Invalid expression")
            rhs = values.pop()
            lhs = values.pop()
            if tok == "+":
                values.append(lhs + rhs)
            elif tok == "-":
                values.append(lhs - rhs)
            elif tok == "*":
                values.append(lhs * rhs)
            elif tok == "/":
                if rhs == 0:
                    raise ValueError("Division by zero")
                values.append(lhs / rhs)
            elif tok == "@":
                values.append(3 * lhs - 2 * rhs)  # custom @: a @ b = 3a - 2b
            else:
                raise ValueError(f"Invalid operator: {tok}")

        if len(values) != 1:
            raise ValueError("Invalid expression")
        return values[0]

    # Strip spaces and do a cheap balance check before tokenizing.
    expression = expression.replace(" ", "")
    if expression.count("(") != expression.count(")"):
        raise ValueError("Mismatched parentheses")

    value = _run_rpn(_to_rpn(_scan(expression)))
    # Present integral results as plain ints (e.g. 5 rather than 5.0).
    return int(value) if value.is_integer() else value
211
+
212
+
213
def generate_data(total_num_dataset, split, agent_name="math_expression"):
    """Build an RL dataset of `total_num_dataset` '@'-expression prompts.

    Each row carries the chat-format prompt, the ground-truth answer for the
    reward model, bookkeeping metadata, and the agent name used for routing.

    Args:
        total_num_dataset: number of samples to generate.
        split: split label stored in each row's extra_info ("train"/"test").
        agent_name: agent loop name stored in the `agent_name` column.

    Returns:
        pandas.DataFrame with columns prompt, data_source, ability,
        reward_model, extra_info, agent_name.
    """
    columns = {
        "prompt": [],
        "data_source": [],
        "ability": [],
        "reward_model": [],
        "extra_info": [],
        "agent_name": [],
    }

    for sample_idx in range(total_num_dataset):
        # Retry until the generator produces an expression our evaluator accepts.
        while True:
            try:
                expression: str = generate_math_expression(
                    min_terms=2, max_terms=3, min_number=1, max_number=10, allow_decimals=False, max_depth=1
                )
                answer = str(calculate(expression))
                break
            except Exception as e:
                print(e)
                continue

        # One tool call is expected per operator occurrence.
        expected_calls = sum(expression.count(sym) for sym in ("+", "-", "*", "@"))

        question = (
            "We define a new math operator @, where you can only call an external tool to compute. "
            "Please put your final answer inside \\boxed{} only in the last turn. Now answer the "
            f"following questions:\nCompute {expression}"
        )

        columns["prompt"].append([{"role": "user", "content": question}])
        columns["data_source"].append("lighteval/MATH")
        columns["ability"].append("math")
        columns["reward_model"].append({"style": "lighteval/MATH", "ground_truth": answer})
        columns["extra_info"].append(
            {"index": sample_idx, "expression": expression, "split": split, "expected_tool_calls": expected_calls}
        )
        columns["agent_name"].append(agent_name)

    return pd.DataFrame(data=columns)
267
+
268
+
269
if __name__ == "__main__":
    # CLI entry point: generate train/test parquet files for the tool task.
    parser = argparse.ArgumentParser(description="Math Expression Dataset Generator")
    parser.add_argument("--train_size", type=int, default=5000, help="Number of training samples")
    parser.add_argument("--test_size", type=int, default=500, help="Number of testing samples")
    parser.add_argument("--output_dir", default="data/math_expression_tool", help="Directory to save the dataset")
    parser.add_argument("--agent_name", default="math_expression", help="Name of the agent")
    args = parser.parse_args()

    # Spot-check examples for calculate() (a @ b = 3a - 2b):
    # print(calculate("3@2")) # Output: 5 (3*3 - 2*2)
    # print(calculate("3@2+4")) # Output: 9 (5 + 4)
    # print(calculate("3*(4@2)")) # Output: 24 (3 * 8)
    # print(calculate("(5@3)*2")) # Output: 18 (9 * 2)

    train_dataset = generate_data(total_num_dataset=args.train_size, split="train", agent_name=args.agent_name)
    test_dataset = generate_data(total_num_dataset=args.test_size, split="test", agent_name=args.agent_name)

    # Make sure the dataset directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    # Save the datasets to parquet files
    train_dataset.to_parquet(os.path.join(args.output_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(args.output_dir, "test.parquet"))
ICL/DAPO/verl-recipe/langgraph_agent/example/math_expression.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from langchain_core.tools import tool
15
+ from recipe.langgraph_agent.react_agent_loop import ReactAgentLoop
16
+
17
+
18
@tool(parse_docstring=True)
def calculate(a: int, b: int, operand: str) -> int:
    """
    Compute the results using operand with two integers

    Args:
        a: the first operand
        b: the second operand
        operand: '+' or '-' or '*' or '@'
    """
    # The operand string comes from LLM-generated tool calls; dispatch
    # explicitly instead of feeding model-controlled input to eval().
    assert operand in ["+", "-", "*", "@"], f"unknown operand {operand}"
    if operand == "@":
        # Custom operator used to force tool use: a @ b = 3a - 2b.
        return 3 * a - 2 * b
    if operand == "+":
        return a + b
    if operand == "-":
        return a - b
    return a * b  # only '*' remains after the assert above
32
+
33
+
34
class MathExpressionReactAgentLoop(ReactAgentLoop):
    """ReactAgentLoop for the math-expression task: the agent gets only the
    `calculate` tool, which it must call to evaluate the custom '@' operator."""

    @classmethod
    def init_class(cls, config, tokenizer, **kwargs):
        # Register the tool set before the base class builds the agent graph.
        # NOTE(review): **kwargs is accepted but not forwarded to
        # super().init_class — confirm whether ReactAgentLoop expects extras.
        cls.tools = [calculate]
        super().init_class(config, tokenizer)
ICL/DAPO/verl-recipe/langgraph_agent/example/run_gpt_oss_20b_bf16.sh ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch GRPO training of gpt-oss-20b-bf16 on the math-expression tool task.
# Runs directly (bash) or under SLURM (sbatch) using the headers below.
#SBATCH --job-name=rl-langgraph-3B
#SBATCH --partition=main
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=64
#SBATCH --gres=gpu:4
#SBATCH --mem=0
#SBATCH --time=10:00:00
#SBATCH --output=%x_%j.out
#SBATCH --error=%x_%j.err

set -xeuo pipefail

# ================= cluster topology =================
# Detection order: SLURM variables -> explicit env override -> default.
export GPUS_PER_NODE=${SLURM_GPUS_ON_NODE:-${GPUS_PER_NODE:-2}} # GPUs on this node
NNODES=${SLURM_JOB_NUM_NODES:-${NNODES:-1}}
export NNODES
export RAY_NUM_NODES=$NNODES

# Require at least 2 GPUs
TOTAL_GPUS=$((GPUS_PER_NODE * NNODES))
if [ "$TOTAL_GPUS" -lt 2 ]; then
    echo "Error: at least 2 GPUs are required, detected $TOTAL_GPUS." >&2
    exit 1
fi

echo "Using $NNODES nodes and $GPUS_PER_NODE GPUs per node..."

# ================= data/model/tool =================
HDFS_ROOT=${HDFS_ROOT:-$PWD}
DATA_ROOT=${DATA_ROOT:-$PWD}

model_path="lmsys/gpt-oss-20b-bf16"

# Use the default output directory produced by create_dataset.py
train_files=$DATA_ROOT/data/math_expression_tool/train.parquet
test_files=$DATA_ROOT/data/math_expression_tool/test.parquet

# Agent config
agent_loop_config_path=recipe/langgraph_agent/example/agent.yaml

# =================== wandb ===================
project_name=math_expression_tool
experiment_name=gpt-oss-20b-bf16
default_local_dir=$DATA_ROOT/checkpoint/$experiment_name

# ================= algorithm =================
adv_estimator=grpo

use_kl_in_reward=false
kl_coef=0.0
use_kl_loss=false
kl_loss_coef=0.0

clip_ratio_low=0.2
clip_ratio_high=0.28

max_turns=8
max_prompt_length=1024
max_response_length=8192
actor_lr=1e-6

train_batch_size=128
ppo_mini_batch_size=16
n_resp_per_prompt=8
n_resp_per_prompt_val=1

# =================== logging ===================
export RAY_LOGGING_LEVEL=DEBUG
export HYDRA_FULL_ERROR=1

# ================= performance =================
export NCCL_IBEXT_DISABLE=1
export NCCL_NVLS_ENABLE=1
export NCCL_IB_HCA=mlx5
export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1
export VLLM_USE_V1=1
export VLLM_ATTENTION_BACKEND=FLASH_ATTN

infer_tp=2 # vLLM tensor parallel size
train_sp=4 # Ulysses sequence parallel size for actor
offload=true

actor_max_token_len_per_gpu=$(( (max_prompt_length + max_response_length) * 4 ))
log_prob_max_token_len_per_gpu=$(( actor_max_token_len_per_gpu * 2 ))

train_files="['$train_files']"
test_files="['$test_files']"

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=$adv_estimator \
    algorithm.use_kl_in_reward=$use_kl_in_reward \
    algorithm.kl_ctrl.kl_coef=$kl_coef \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.return_raw_chat=true \
    data.train_batch_size=$train_batch_size \
    data.max_prompt_length=$max_prompt_length \
    data.max_response_length=$max_response_length \
    data.filter_overlong_prompts=true \
    data.truncation='error' \
    actor_rollout_ref.model.path="$model_path" \
    actor_rollout_ref.model.use_remove_padding=true \
    actor_rollout_ref.model.enable_gradient_checkpointing=true \
    actor_rollout_ref.actor.use_kl_loss=$use_kl_loss \
    actor_rollout_ref.actor.kl_loss_coef=$kl_loss_coef \
    actor_rollout_ref.actor.clip_ratio_low=$clip_ratio_low \
    actor_rollout_ref.actor.clip_ratio_high=$clip_ratio_high \
    actor_rollout_ref.actor.clip_ratio_c=10.0 \
    actor_rollout_ref.actor.optim.lr=$actor_lr \
    actor_rollout_ref.actor.use_dynamic_bsz=true \
    actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$actor_max_token_len_per_gpu \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size=$train_sp \
    actor_rollout_ref.actor.fsdp_config.param_offload=$offload \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=$offload \
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=$log_prob_max_token_len_per_gpu \
    actor_rollout_ref.rollout.name=sglang \
    actor_rollout_ref.rollout.mode=async \
    actor_rollout_ref.rollout.tensor_model_parallel_size=$infer_tp \
    actor_rollout_ref.rollout.multi_turn.max_user_turns=$max_turns \
    actor_rollout_ref.rollout.multi_turn.max_assistant_turns=$max_turns \
    actor_rollout_ref.rollout.multi_turn.format=gpt-oss \
    +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend=triton \
    actor_rollout_ref.rollout.agent.agent_loop_config_path=$agent_loop_config_path \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
    actor_rollout_ref.rollout.n=$n_resp_per_prompt \
    actor_rollout_ref.rollout.val_kwargs.top_p=1.0 \
    actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
    actor_rollout_ref.rollout.val_kwargs.n=$n_resp_per_prompt_val \
    trainer.logger='["console","wandb"]' \
    trainer.project_name=$project_name \
    trainer.experiment_name=$experiment_name \
    trainer.n_gpus_per_node="$GPUS_PER_NODE" \
    trainer.val_before_train=true \
    trainer.log_val_generations=50 \
    trainer.nnodes="$NNODES" \
    trainer.save_freq=-1 \
    trainer.default_local_dir="$default_local_dir" \
    trainer.test_freq=5 \
    trainer.total_epochs=1 "$@"
ICL/DAPO/verl-recipe/langgraph_agent/example/run_qwen2.5_3b.sh ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #SBATCH --job-name=rl-langgraph-3B
3
+ #SBATCH --partition=main
4
+ #SBATCH --nodes=1
5
+ #SBATCH --ntasks-per-node=1
6
+ #SBATCH --cpus-per-task=64
7
+ #SBATCH --gres=gpu:4
8
+ #SBATCH --mem=0
9
+ #SBATCH --time=10:00:00
10
+ #SBATCH --output=%x_%j.out
11
+ #SBATCH --error=%x_%j.err
12
+
13
+ set -xeuo pipefail
14
+
15
+ # ================= cluster topology =================
16
+ export GPUS_PER_NODE=${SLURM_GPUS_ON_NODE:-${GPUS_PER_NODE:-2}} # GPUs on this node
17
+ NNODES=${SLURM_JOB_NUM_NODES:-${NNODES:-1}}
18
+ export NNODES
19
+ export RAY_NUM_NODES=$NNODES
20
+
21
+ # Require at least 2 GPUs
22
+ TOTAL_GPUS=$((GPUS_PER_NODE * NNODES))
23
+ if [ "$TOTAL_GPUS" -lt 2 ]; then
24
+ echo "Error: at least 2 GPUs are required, detected $TOTAL_GPUS." >&2
25
+ exit 1
26
+ fi
27
+
28
+ echo "Using $NNODES nodes and $GPUS_PER_NODE GPUs per node..."
29
+
30
+ # ================= data/model/tool =================
31
+ HDFS_ROOT=${HDFS_ROOT:-$PWD}
32
+ DATA_ROOT=${DATA_ROOT:-$PWD}
33
+
34
+ # Prefer local model if present, otherwise fall back to HF hub path
35
+ model_path=${model_path:-$DATA_ROOT/model/Qwen2.5-3B-Instruct}
36
+ if [ ! -d "$model_path" ]; then
37
+ model_path=Qwen/Qwen2.5-3B-Instruct
38
+ fi
39
+
40
+ # Use the default output directory produced by create_dataset.py
41
+ train_files=$DATA_ROOT/data/math_expression_tool/train.parquet
42
+ test_files=$DATA_ROOT/data/math_expression_tool/test.parquet
43
+
44
+ # Agent config
45
+ agent_loop_config_path=recipe/langgraph_agent/example/agent.yaml
46
+
47
+ # =================== wandb ===================
48
+ project_name=math_expression_tool
49
+ experiment_name=qwen2.5-3b
50
+ default_local_dir=$DATA_ROOT/checkpoint/$experiment_name
51
+
52
+ # ================= algorithm =================
53
+ adv_estimator=grpo
54
+
55
+ use_kl_in_reward=false
56
+ kl_coef=0.0
57
+ use_kl_loss=false
58
+ kl_loss_coef=0.0
59
+
60
+ clip_ratio_low=0.2
61
+ clip_ratio_high=0.28
62
+
63
+ max_turns=8
64
+ max_prompt_length=1024
65
+ max_response_length=2048
66
+ actor_lr=1e-6
67
+
68
+ train_batch_size=128
69
+ ppo_mini_batch_size=16
70
+ n_resp_per_prompt=8
71
+ n_resp_per_prompt_val=1
72
+
73
+ # =================== logging ===================
74
+ export RAY_LOGGING_LEVEL=DEBUG
75
+ export HYDRA_FULL_ERROR=1
76
+
77
+ # ================= performance =================
78
+ export NCCL_IBEXT_DISABLE=1
79
+ export NCCL_NVLS_ENABLE=1
80
+ export NCCL_IB_HCA=mlx5
81
+ export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1
82
+ export VLLM_USE_V1=1
83
+ export VLLM_ATTENTION_BACKEND=FLASH_ATTN
84
+
85
+ infer_tp=2 # vLLM tensor parallel size
86
+ train_sp=4 # Ulysses sequence parallel size for actor
87
+ offload=true
88
+
89
+ actor_max_token_len_per_gpu=$(( (max_prompt_length + max_response_length) * 4 ))
90
+ log_prob_max_token_len_per_gpu=$(( actor_max_token_len_per_gpu * 2 ))
91
+
92
+ train_files="['$train_files']"
93
+ test_files="['$test_files']"
94
+
95
+ python3 -m verl.trainer.main_ppo \
96
+ algorithm.adv_estimator=$adv_estimator \
97
+ algorithm.use_kl_in_reward=$use_kl_in_reward \
98
+ algorithm.kl_ctrl.kl_coef=$kl_coef \
99
+ data.train_files="$train_files" \
100
+ data.val_files="$test_files" \
101
+ data.return_raw_chat=true \
102
+ data.train_batch_size=$train_batch_size \
103
+ data.max_prompt_length=$max_prompt_length \
104
+ data.max_response_length=$max_response_length \
105
+ data.filter_overlong_prompts=true \
106
+ data.truncation='error' \
107
+ actor_rollout_ref.model.path="$model_path" \
108
+ actor_rollout_ref.model.use_remove_padding=true \
109
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
110
+ actor_rollout_ref.actor.use_kl_loss=$use_kl_loss \
111
+ actor_rollout_ref.actor.kl_loss_coef=$kl_loss_coef \
112
+ actor_rollout_ref.actor.clip_ratio_low=$clip_ratio_low \
113
+ actor_rollout_ref.actor.clip_ratio_high=$clip_ratio_high \
114
+ actor_rollout_ref.actor.clip_ratio_c=10.0 \
115
+ actor_rollout_ref.actor.optim.lr=$actor_lr \
116
+ actor_rollout_ref.actor.use_dynamic_bsz=true \
117
+ actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
118
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$actor_max_token_len_per_gpu \
119
+ actor_rollout_ref.actor.ulysses_sequence_parallel_size=$train_sp \
120
+ actor_rollout_ref.actor.fsdp_config.param_offload=$offload \
121
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=$offload \
122
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=$log_prob_max_token_len_per_gpu \
123
+ actor_rollout_ref.rollout.name=vllm \
124
+ actor_rollout_ref.rollout.mode=async \
125
+ actor_rollout_ref.rollout.tensor_model_parallel_size=$infer_tp \
126
+ actor_rollout_ref.rollout.multi_turn.max_user_turns=$max_turns \
127
+ actor_rollout_ref.rollout.multi_turn.max_assistant_turns=$max_turns \
128
+ actor_rollout_ref.rollout.multi_turn.format=hermes \
129
+ actor_rollout_ref.rollout.agent.agent_loop_config_path=$agent_loop_config_path \
130
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
131
+ actor_rollout_ref.rollout.n=$n_resp_per_prompt \
132
+ actor_rollout_ref.rollout.val_kwargs.top_p=0.6 \
133
+ actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
134
+ actor_rollout_ref.rollout.val_kwargs.n=$n_resp_per_prompt_val \
135
+ trainer.logger='["console","wandb"]' \
136
+ trainer.project_name=$project_name \
137
+ trainer.experiment_name=$experiment_name \
138
+ trainer.n_gpus_per_node="$GPUS_PER_NODE" \
139
+ trainer.val_before_train=true \
140
+ trainer.log_val_generations=50 \
141
+ trainer.nnodes="$NNODES" \
142
+ trainer.save_freq=-1 \
143
+ trainer.default_local_dir="$default_local_dir" \
144
+ trainer.test_freq=5 \
145
+ trainer.total_epochs=1 "$@"
ICL/DAPO/verl-recipe/open_math_reasoning/run_eval.sh ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+
3
+ # Evaluation
4
+ python3 -m verl.trainer.main_eval \
5
+ data.path=$HOME/data/gen/qwen_8b_gen_test.parquet \
6
+ custom_reward_function.path=recipe/open_math_reasoning/compute_score.py \
7
+ custom_reward_function.name=compute_score_data_source
ICL/DAPO/verl-recipe/prime/config/prime_trainer.yaml ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # the prime config will override default ppo_trainer.yaml
2
+
3
+ hydra:
4
+ searchpath:
5
+ - file://verl/trainer/config
6
+
7
+ defaults:
8
+ - ppo_trainer
9
+ - _self_
10
+
11
+ data:
12
+ filter_accuracy: True
13
+ accuracy_lower_bound: 0.2
14
+ accuracy_upper_bound: 0.8
15
+ oversample_factor: 4.0 # Sample more responses than the batch size. prompts satisfying the filter will be prioritized.
16
+ filter_truncate: True
17
+ truncation: right
18
+
19
+ actor_rollout_ref:
20
+ hybrid_engine: True
21
+ model:
22
+ use_remove_padding: True
23
+ rollout:
24
+ mode: sync
25
+ # number of responses (i.e. num sample times)
26
+ n: 4
27
+ actor:
28
+ entropy_coeff: 0.001
29
+
30
+ reward_model:
31
+ enable: True
32
+ strategy: fsdp
33
+ model:
34
+ ref_path: ${reward_model.model.path}
35
+ use_remove_padding: True
36
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
37
+ fused_kernel_options:
38
+ impl_backend: torch # triton, torch
39
+ tokenizer_path: ${actor_rollout_ref.model.path}
40
+ enable_gradient_checkpointing: ${actor_rollout_ref.model.enable_gradient_checkpointing}
41
+ ref_type: freeze
42
+ fsdp_config:
43
+ min_num_params: 0
44
+ param_offload: ${actor_rollout_ref.actor.fsdp_config.param_offload}
45
+ optimizer_offload: ${actor_rollout_ref.actor.fsdp_config.optimizer_offload}
46
+ update: before # ``before`` for double-forward, ``after`` for single-forward
47
+ optim:
48
+ lr: 1e-6
49
+ lr_warmup_steps: -1 # Prioritized. Negative values mean delegating to lr_warmup_steps_ratio.
50
+ lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
51
+ min_lr_ratio: null
52
+ warmup_style: null # deprecated
53
+ lr_scheduler_type: constant
54
+ total_training_steps: -1 # must be overridden by program
55
+ weight_decay: 0.
56
+ grad_clip: 10.0
57
+ beta_train: 0.05
58
+ loss_type: ce # currently only supports ce loss
59
+ prime_granularity: token
60
+ prime_norm: batch_norm # batch_norm or none. if set to none, the normalizer is beta_train
61
+ mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
62
+ reward_manager: prime
63
+
64
+ algorithm:
65
+ adv_estimator: rloo
66
+ # now supports rloo. it treats different source of reward separately.
67
+ kl_ctrl:
68
+ type: fixed
69
+ kl_coef: 0.000
70
+ reward_gt_coef: 5
71
+ reward_dpo_coef: 5
72
+
73
+ trainer:
74
+ project_name: prime
75
+ experiment_name: examples
76
+ val_before_train: False
77
+ balance_batch: False
ICL/DAPO/verl-recipe/qat/config/nvfp4_w4a16.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "quant_method": "compressed-tensors",
3
+ "format": "nvfp4-pack-quantized",
4
+ "quantization_status": "compressed",
5
+ "config_groups": {
6
+ "group_0": {
7
+ "format": "nvfp4-pack-quantized",
8
+ "targets": [
9
+ "Linear"
10
+ ],
11
+ "weights": {
12
+ "actorder": null,
13
+ "block_structure": null,
14
+ "dynamic": false,
15
+ "group_size": 16,
16
+ "num_bits": 4,
17
+ "observer": "minmax",
18
+ "observer_kwargs": {},
19
+ "strategy": "tensor_group",
20
+ "symmetric": true,
21
+ "type": "float"
22
+ },
23
+ "input_activations": null,
24
+ "output_activations": null
25
+ }
26
+ },
27
+ "ignore": [
28
+ "lm_head"
29
+ ],
30
+ "kv_cache_scheme": null,
31
+ "sparsity_config": {},
32
+ "transform_config": {},
33
+ "global_compression_ratio": null
34
+ }
ICL/DAPO/verl-recipe/qat/config/nvfp4_w4a4.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "quant_method": "compressed-tensors",
3
+ "format": "nvfp4-pack-quantized",
4
+ "quantization_status": "compressed",
5
+ "config_groups": {
6
+ "group_0": {
7
+ "format": "nvfp4-pack-quantized",
8
+ "targets": [
9
+ "Linear"
10
+ ],
11
+ "weights": {
12
+ "num_bits": 4,
13
+ "type": "float",
14
+ "symmetric": true,
15
+ "strategy": "tensor_group",
16
+ "group_size": 16,
17
+ "dynamic": false,
18
+ "observer": "minmax",
19
+ "observer_kwargs": {},
20
+ "actorder": null,
21
+ "block_structure": null
22
+ },
23
+ "input_activations": {
24
+ "num_bits": 4,
25
+ "type": "float",
26
+ "symmetric": true,
27
+ "strategy": "tensor_group",
28
+ "group_size": 16,
29
+ "dynamic": "local",
30
+ "observer": "minmax",
31
+ "observer_kwargs": {},
32
+ "actorder": null,
33
+ "block_structure": null
34
+ },
35
+ "output_activations": null
36
+ }
37
+ },
38
+ "ignore": [
39
+ "lm_head"
40
+ ],
41
+ "kv_cache_scheme": null,
42
+ "sparsity_config": {},
43
+ "transform_config": {},
44
+ "global_compression_ratio": null
45
+ }
ICL/DAPO/verl-recipe/r1/config/evaluation.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ path: /tmp/math_Qwen2-7B-Instruct.parquet
3
+ prompt_key: prompt
4
+ response_key: responses
5
+ data_source_key: data_source
6
+ reward_model_key: reward_model
7
+
8
+ custom_reward_function:
9
+ path: null
10
+ name: compute_score
11
+
12
+ ray_kwargs:
13
+ ray_init:
14
+ num_cpus: null # `None` means using all CPUs, which might cause hang if limited in systems like SLURM. Please set to a number allowed then.
ICL/DAPO/verl-recipe/r1/tasks/math_reward.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import contextlib
15
+
16
+ try:
17
+ from math_verify.metric import math_metric
18
+ from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig
19
+ except ImportError:
20
+ print("To use Math-Verify, please install it first by running `pip install math-verify`.")
21
+
22
+
23
+ def compute_score(model_output: str, ground_truth: str) -> bool:
24
+ verify_func = math_metric(
25
+ gold_extraction_target=(LatexExtractionConfig(),),
26
+ pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig()),
27
+ )
28
+ ret_score = 0.0
29
+
30
+ # Wrap the ground truth in \boxed{} format for verification
31
+ ground_truth_boxed = "\\boxed{" + ground_truth + "}"
32
+ with contextlib.suppress(Exception):
33
+ ret_score, _ = verify_func([ground_truth_boxed], [model_output])
34
+
35
+ return ret_score
ICL/DAPO/verl-recipe/r1_ascend/figures/response_len.png ADDED
ICL/DAPO/verl-recipe/r1_ascend/figures/rewards.png ADDED
ICL/DAPO/verl-recipe/r1_ascend/figures/val_score.png ADDED
ICL/DAPO/verl-recipe/rep_exp/config/_generated_ppo_megatron_trainer.yaml ADDED
@@ -0,0 +1,594 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This reference configration yaml is automatically generated via 'scripts/generate_trainer_config.sh'
2
+ # in which it invokes 'python3 scripts/print_cfg.py --cfg job --config-name=ppo_megatron_trainer.yaml' to flatten the 'verl/trainer/config/ppo_megatron_trainer.yaml' config fields into a single file.
3
+ # Do not modify this file directly.
4
+ # The file is usually only for reference and never used.
5
+
6
+ actor_rollout_ref:
7
+ actor:
8
+ optim:
9
+ _target_: verl.workers.config.McoreOptimizerConfig
10
+ lr: 1.0e-06
11
+ lr_warmup_steps_ratio: 0.0
12
+ total_training_steps: -1
13
+ weight_decay: 0.01
14
+ lr_warmup_steps: -1
15
+ betas:
16
+ - 0.9
17
+ - 0.999
18
+ clip_grad: 1.0
19
+ optimizer: adam
20
+ lr_warmup_init: 0.0
21
+ lr_decay_steps: null
22
+ lr_decay_style: constant
23
+ min_lr: 0.0
24
+ weight_decay_incr_style: constant
25
+ lr_wsd_decay_style: exponential
26
+ lr_wsd_decay_steps: null
27
+ use_checkpoint_opt_param_scheduler: false
28
+ override_optimizer_config: {}
29
+ megatron:
30
+ _target_: verl.workers.config.McoreEngineConfig
31
+ param_offload: false
32
+ grad_offload: false
33
+ optimizer_offload: false
34
+ tensor_model_parallel_size: 1
35
+ expert_model_parallel_size: 1
36
+ expert_tensor_parallel_size: null
37
+ pipeline_model_parallel_size: 1
38
+ virtual_pipeline_model_parallel_size: null
39
+ context_parallel_size: 1
40
+ sequence_parallel: true
41
+ use_distributed_optimizer: true
42
+ use_dist_checkpointing: false
43
+ dist_checkpointing_path: null
44
+ dist_checkpointing_prefix: ''
45
+ seed: 42
46
+ override_ddp_config: {}
47
+ override_transformer_config:
48
+ recompute_granularity: null
49
+ recompute_modules:
50
+ - core_attn
51
+ recompute_method: null
52
+ recompute_num_layers: null
53
+ attention_backend: flash
54
+ override_mcore_model_config: {}
55
+ use_mbridge: false
56
+ forward_only: false
57
+ dtype: bfloat16
58
+ _target_: verl.workers.config.McoreActorConfig
59
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
60
+ strategy: megatron
61
+ ppo_mini_batch_size: 256
62
+ ppo_micro_batch_size: null
63
+ ppo_micro_batch_size_per_gpu: null
64
+ use_dynamic_bsz: false
65
+ ppo_max_token_len_per_gpu: 16384
66
+ clip_ratio: 0.2
67
+ clip_ratio_low: 0.2
68
+ clip_ratio_high: 0.2
69
+ freeze_vision_tower: false
70
+ policy_loss:
71
+ _target_: verl.workers.config.PolicyLossConfig
72
+ loss_mode: vanilla
73
+ clip_cov_ratio: 0.0002
74
+ clip_cov_lb: 1.0
75
+ clip_cov_ub: 5.0
76
+ kl_cov_ratio: 0.0002
77
+ ppo_kl_coef: 0.1
78
+ clip_ratio_c: 3.0
79
+ loss_agg_mode: token-mean
80
+ entropy_coeff: 0
81
+ use_kl_loss: false
82
+ use_torch_compile: true
83
+ kl_loss_coef: 0.001
84
+ kl_loss_type: low_var_kl
85
+ ppo_epochs: 1
86
+ shuffle: false
87
+ checkpoint:
88
+ _target_: verl.trainer.config.CheckpointConfig
89
+ save_contents:
90
+ - model
91
+ - optimizer
92
+ - extra
93
+ load_contents: ${.save_contents}
94
+ async_save: false
95
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
96
+ profiler:
97
+ _target_: verl.utils.profiler.ProfilerConfig
98
+ tool: ${oc.select:global_profiler.tool,null}
99
+ enable: false
100
+ all_ranks: false
101
+ ranks: []
102
+ save_path: ${oc.select:global_profiler.save_path,null}
103
+ tool_config:
104
+ nsys:
105
+ _target_: verl.utils.profiler.config.NsightToolConfig
106
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
107
+ npu:
108
+ _target_: verl.utils.profiler.config.NPUToolConfig
109
+ contents: []
110
+ level: level1
111
+ analysis: true
112
+ discrete: false
113
+ torch:
114
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
115
+ step_start: 0
116
+ step_end: null
117
+ torch_memory:
118
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
119
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
120
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
121
+ data_loader_seed: 42
122
+ load_weight: true
123
+ ref:
124
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
125
+ strategy: megatron
126
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
127
+ log_prob_micro_batch_size: null
128
+ log_prob_micro_batch_size_per_gpu: null
129
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
130
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
131
+ profiler:
132
+ _target_: verl.utils.profiler.ProfilerConfig
133
+ tool: ${oc.select:global_profiler.tool,null}
134
+ enable: false
135
+ all_ranks: false
136
+ ranks: []
137
+ save_path: ${oc.select:global_profiler.save_path,null}
138
+ tool_config:
139
+ nsys:
140
+ _target_: verl.utils.profiler.config.NsightToolConfig
141
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
142
+ npu:
143
+ _target_: verl.utils.profiler.config.NPUToolConfig
144
+ contents: []
145
+ level: level1
146
+ analysis: true
147
+ discrete: false
148
+ torch:
149
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
150
+ step_start: 0
151
+ step_end: null
152
+ torch_memory:
153
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
154
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
155
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
156
+ megatron:
157
+ _target_: verl.workers.config.McoreEngineConfig
158
+ param_offload: ${oc.select:actor_rollout_ref.actor.megatron.param_offload,False}
159
+ grad_offload: false
160
+ optimizer_offload: false
161
+ tensor_model_parallel_size: ${oc.select:actor_rollout_ref.actor.megatron.tensor_model_parallel_size,1}
162
+ expert_model_parallel_size: ${oc.select:actor_rollout_ref.actor.megatron.expert_model_parallel_size,1}
163
+ expert_tensor_parallel_size: ${oc.select:actor_rollout_ref.actor.megatron.expert_tensor_parallel_size,null}
164
+ pipeline_model_parallel_size: ${oc.select:actor_rollout_ref.actor.megatron.pipeline_model_parallel_size,1}
165
+ virtual_pipeline_model_parallel_size: ${oc.select:actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size,null}
166
+ context_parallel_size: ${oc.select:actor_rollout_ref.actor.megatron.context_parallel_size,1}
167
+ sequence_parallel: true
168
+ use_distributed_optimizer: true
169
+ use_dist_checkpointing: false
170
+ dist_checkpointing_path: null
171
+ dist_checkpointing_prefix: ''
172
+ seed: ${oc.select:actor_rollout_ref.actor.megatron.seed,42}
173
+ override_ddp_config: {}
174
+ override_transformer_config: ${oc.select:actor_rollout_ref.actor.megatron.override_transformer_config,{}}
175
+ override_mcore_model_config: {}
176
+ use_mbridge: ${oc.select:actor_rollout_ref.actor.megatron.use_mbridge,False}
177
+ forward_only: true
178
+ dtype: bfloat16
179
+ _target_: verl.workers.config.McoreActorConfig
180
+ load_weight: true
181
+ rollout:
182
+ _target_: verl.workers.config.RolloutConfig
183
+ name: ???
184
+ mode: async
185
+ temperature: 1.0
186
+ top_k: -1
187
+ top_p: 1
188
+ prompt_length: ${oc.select:data.max_prompt_length,512}
189
+ response_length: ${oc.select:data.max_response_length,512}
190
+ dtype: bfloat16
191
+ gpu_memory_utilization: 0.5
192
+ ignore_eos: false
193
+ enforce_eager: false
194
+ cudagraph_capture_sizes: null
195
+ free_cache_engine: true
196
+ tensor_model_parallel_size: 2
197
+ data_parallel_size: 1
198
+ expert_parallel_size: 1
199
+ pipeline_model_parallel_size: 1
200
+ max_num_batched_tokens: 8192
201
+ max_model_len: null
202
+ max_num_seqs: 1024
203
+ enable_chunked_prefill: true
204
+ enable_prefix_caching: true
205
+ load_format: dummy
206
+ log_prob_micro_batch_size: null
207
+ log_prob_micro_batch_size_per_gpu: null
208
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
209
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
210
+ disable_log_stats: true
211
+ do_sample: true
212
+ 'n': 1
213
+ over_sample_rate: 0
214
+ multi_stage_wake_up: false
215
+ engine_kwargs:
216
+ vllm: {}
217
+ sglang: {}
218
+ val_kwargs:
219
+ _target_: verl.workers.config.SamplingConfig
220
+ top_k: -1
221
+ top_p: 1.0
222
+ temperature: 0
223
+ 'n': 1
224
+ do_sample: false
225
+ multi_turn:
226
+ _target_: verl.workers.config.MultiTurnConfig
227
+ enable: false
228
+ max_assistant_turns: null
229
+ tool_config_path: null
230
+ max_user_turns: null
231
+ max_parallel_calls: 1
232
+ max_tool_response_length: 256
233
+ tool_response_truncate_side: middle
234
+ interaction_config_path: null
235
+ use_inference_chat_template: false
236
+ tokenization_sanity_check_mode: strict
237
+ format: hermes
238
+ num_repeat_rollouts: null
239
+ calculate_log_probs: false
240
+ agent:
241
+ _target_: verl.workers.config.AgentLoopConfig
242
+ num_workers: 8
243
+ default_agent_loop: single_turn_agent
244
+ agent_loop_config_path: null
245
+ custom_async_server:
246
+ _target_: verl.workers.config.CustomAsyncServerConfig
247
+ path: null
248
+ name: null
249
+ update_weights_bucket_megabytes: 512
250
+ trace:
251
+ _target_: verl.workers.config.TraceConfig
252
+ backend: null
253
+ token2text: false
254
+ max_samples_per_step_per_worker: null
255
+ skip_rollout: false
256
+ skip_dump_dir: /tmp/rollout_dump
257
+ skip_tokenizer_init: true
258
+ profiler:
259
+ _target_: verl.utils.profiler.ProfilerConfig
260
+ tool: ${oc.select:global_profiler.tool,null}
261
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
262
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
263
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
264
+ save_path: ${oc.select:global_profiler.save_path,null}
265
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
266
+ prometheus:
267
+ _target_: verl.workers.config.PrometheusConfig
268
+ enable: false
269
+ port: 9090
270
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
271
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
272
+ quantization: null
273
+ layer_name_map:
274
+ qkv_layer_name: qkv
275
+ gate_proj_layer_name: gate_up
276
+ model:
277
+ _target_: verl.workers.config.HFModelConfig
278
+ path: ~/models/deepseek-llm-7b-chat
279
+ hf_config_path: null
280
+ tokenizer_path: null
281
+ use_shm: false
282
+ trust_remote_code: false
283
+ custom_chat_template: null
284
+ external_lib: null
285
+ override_config:
286
+ model_config: {}
287
+ moe_config:
288
+ freeze_moe_router: false
289
+ enable_gradient_checkpointing: true
290
+ enable_activation_offload: false
291
+ use_remove_padding: false
292
+ lora_rank: 0
293
+ lora_alpha: 16
294
+ target_modules: all-linear
295
+ exclude_modules: null
296
+ lora_adapter_path: null
297
+ use_liger: false
298
+ use_fused_kernels: false
299
+ fused_kernel_options:
300
+ impl_backend: torch
301
+ hybrid_engine: true
302
+ nccl_timeout: 600
303
+ data:
304
+ tokenizer: null
305
+ use_shm: false
306
+ train_files: ~/data/rlhf/gsm8k/train.parquet
307
+ val_files: ~/data/rlhf/gsm8k/test.parquet
308
+ train_max_samples: -1
309
+ val_max_samples: -1
310
+ prompt_key: prompt
311
+ reward_fn_key: data_source
312
+ max_prompt_length: 512
313
+ max_response_length: 512
314
+ train_batch_size: 1024
315
+ val_batch_size: null
316
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
317
+ null}
318
+ return_raw_input_ids: false
319
+ return_raw_chat: true
320
+ return_full_prompt: false
321
+ shuffle: true
322
+ seed: null
323
+ dataloader_num_workers: 8
324
+ image_patch_size: 14
325
+ validation_shuffle: false
326
+ filter_overlong_prompts: false
327
+ filter_overlong_prompts_workers: 1
328
+ truncation: error
329
+ image_key: images
330
+ video_key: videos
331
+ trust_remote_code: false
332
+ custom_cls:
333
+ path: null
334
+ name: null
335
+ return_multi_modal_inputs: true
336
+ sampler:
337
+ class_path: null
338
+ class_name: null
339
+ datagen:
340
+ path: null
341
+ name: null
342
+ apply_chat_template_kwargs: {}
343
+ critic:
344
+ optim:
345
+ _target_: verl.workers.config.McoreOptimizerConfig
346
+ lr: 1.0e-05
347
+ lr_warmup_steps_ratio: 0.0
348
+ total_training_steps: -1
349
+ weight_decay: 0.01
350
+ lr_warmup_steps: -1
351
+ betas:
352
+ - 0.9
353
+ - 0.999
354
+ clip_grad: 1.0
355
+ optimizer: adam
356
+ lr_warmup_init: 0.0
357
+ lr_decay_steps: null
358
+ lr_decay_style: constant
359
+ min_lr: 0.0
360
+ weight_decay_incr_style: constant
361
+ lr_wsd_decay_style: exponential
362
+ lr_wsd_decay_steps: null
363
+ use_checkpoint_opt_param_scheduler: false
364
+ override_optimizer_config: {}
365
+ megatron:
366
+ _target_: verl.workers.config.McoreEngineConfig
367
+ param_offload: false
368
+ grad_offload: false
369
+ optimizer_offload: false
370
+ tensor_model_parallel_size: 1
371
+ expert_model_parallel_size: 1
372
+ expert_tensor_parallel_size: null
373
+ pipeline_model_parallel_size: 1
374
+ virtual_pipeline_model_parallel_size: null
375
+ context_parallel_size: 1
376
+ sequence_parallel: true
377
+ use_distributed_optimizer: true
378
+ use_dist_checkpointing: false
379
+ dist_checkpointing_path: null
380
+ dist_checkpointing_prefix: ''
381
+ seed: 42
382
+ override_ddp_config: {}
383
+ override_transformer_config:
384
+ recompute_granularity: null
385
+ recompute_modules:
386
+ - core_attn
387
+ recompute_method: null
388
+ recompute_num_layers: null
389
+ attention_backend: flash
390
+ override_mcore_model_config: {}
391
+ use_mbridge: false
392
+ forward_only: false
393
+ dtype: bfloat16
394
+ _target_: verl.workers.config.McoreCriticConfig
395
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
396
+ strategy: megatron
397
+ enable: null
398
+ model:
399
+ path: ~/models/deepseek-llm-7b-chat
400
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
401
+ override_config:
402
+ model_config: {}
403
+ moe_config:
404
+ freeze_moe_router: false
405
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
406
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
407
+ _target_: verl.trainer.config.BaseModelConfig
408
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
409
+ ppo_micro_batch_size: null
410
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
411
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
412
+ ppo_max_token_len_per_gpu: 32768
413
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
414
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
415
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
416
+ cliprange_value: 0.5
417
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
418
+ checkpoint:
419
+ _target_: verl.trainer.config.CheckpointConfig
420
+ save_contents:
421
+ - model
422
+ - optimizer
423
+ - extra
424
+ load_contents: ${.save_contents}
425
+ async_save: false
426
+ profiler:
427
+ _target_: verl.utils.profiler.ProfilerConfig
428
+ tool: ${oc.select:global_profiler.tool,null}
429
+ enable: false
430
+ all_ranks: false
431
+ ranks: []
432
+ save_path: ${oc.select:global_profiler.save_path,null}
433
+ tool_config:
434
+ nsys:
435
+ _target_: verl.utils.profiler.config.NsightToolConfig
436
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
437
+ npu:
438
+ _target_: verl.utils.profiler.config.NPUToolConfig
439
+ contents: []
440
+ level: level1
441
+ analysis: true
442
+ discrete: false
443
+ torch:
444
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
445
+ step_start: 0
446
+ step_end: null
447
+ torch_memory:
448
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
449
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
450
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
451
+ nccl_timeout: 600
452
+ load_weight: true
453
+ data_loader_seed: ${oc.select:actor_rollout_ref.actor.data_loader_seed,null}
454
+ reward_model:
455
+ enable: false
456
+ enable_resource_pool: false
457
+ n_gpus_per_node: 0
458
+ nnodes: 0
459
+ strategy: megatron
460
+ model:
461
+ input_tokenizer: ${actor_rollout_ref.model.path}
462
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
463
+ external_lib: ${actor_rollout_ref.model.external_lib}
464
+ trust_remote_code: false
465
+ micro_batch_size: null
466
+ micro_batch_size_per_gpu: null
467
+ max_length: null
468
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
469
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
470
+ reward_manager: naive
471
+ launch_reward_fn_async: false
472
+ sandbox_fusion:
473
+ url: null
474
+ max_concurrent: 64
475
+ memory_limit_mb: 1024
476
+ profiler:
477
+ _target_: verl.utils.profiler.ProfilerConfig
478
+ tool: ${oc.select:global_profiler.tool,null}
479
+ enable: false
480
+ all_ranks: false
481
+ ranks: []
482
+ save_path: ${oc.select:global_profiler.save_path,null}
483
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
484
+ nccl_timeout: 600
485
+ megatron:
486
+ _target_: verl.workers.config.MegatronEngineConfig
487
+ param_offload: false
488
+ tensor_model_parallel_size: 1
489
+ expert_model_parallel_size: 1
490
+ expert_tensor_parallel_size: null
491
+ pipeline_model_parallel_size: 1
492
+ virtual_pipeline_model_parallel_size: null
493
+ context_parallel_size: 1
494
+ sequence_parallel: true
495
+ use_distributed_optimizer: false
496
+ use_dist_checkpointing: false
497
+ dist_checkpointing_path: null
498
+ dist_checkpointing_prefix: ''
499
+ seed: ${oc.select:actor_rollout_ref.actor.megatron.seed,42}
500
+ override_transformer_config: ${oc.select:actor_rollout_ref.actor.megatron.override_transformer_config,{}}
501
+ use_mbridge: ${oc.select:actor_rollout_ref.actor.megatron.use_mbridge,False}
502
+ dtype: bfloat16
503
+ load_weight: true
504
+ algorithm:
505
+ rollout_correction:
506
+ rollout_is: null
507
+ rollout_is_threshold: 2.0
508
+ rollout_rs: null
509
+ rollout_rs_threshold: null
510
+ rollout_rs_threshold_lower: null
511
+ rollout_token_veto_threshold: null
512
+ bypass_mode: false
513
+ use_policy_gradient: false
514
+ rollout_is_batch_normalize: false
515
+ _target_: verl.trainer.config.AlgoConfig
516
+ gamma: 1.0
517
+ lam: 1.0
518
+ adv_estimator: gae
519
+ norm_adv_by_std_in_grpo: true
520
+ use_kl_in_reward: false
521
+ kl_penalty: kl
522
+ kl_ctrl:
523
+ _target_: verl.trainer.config.KLControlConfig
524
+ type: fixed
525
+ kl_coef: 0.001
526
+ horizon: 10000
527
+ target_kl: 0.1
528
+ use_pf_ppo: false
529
+ pf_ppo:
530
+ reweight_method: pow
531
+ weight_pow: 2.0
532
+ custom_reward_function:
533
+ path: null
534
+ name: compute_score
535
+ trainer:
536
+ balance_batch: true
537
+ total_epochs: 30
538
+ total_training_steps: null
539
+ project_name: verl_examples
540
+ experiment_name: gsm8k
541
+ logger:
542
+ - console
543
+ - wandb
544
+ log_val_generations: 0
545
+ nnodes: 1
546
+ n_gpus_per_node: 8
547
+ save_freq: -1
548
+ esi_redundant_time: 0
549
+ resume_mode: auto
550
+ resume_from_path: null
551
+ del_local_ckpt_after_load: false
552
+ val_before_train: true
553
+ test_freq: -1
554
+ critic_warmup: 0
555
+ default_hdfs_dir: null
556
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
557
+ max_actor_ckpt_to_keep: null
558
+ max_critic_ckpt_to_keep: null
559
+ ray_wait_register_center_timeout: 300
560
+ device: cuda
561
+ rollout_data_dir: null
562
+ use_legacy_worker_impl: auto
563
+ global_profiler:
564
+ _target_: verl.utils.profiler.ProfilerConfig
565
+ tool: null
566
+ steps: null
567
+ profile_continuous_steps: false
568
+ save_path: outputs/profile
569
+ global_tool_config:
570
+ nsys:
571
+ discrete: false
572
+ controller_nsight_options:
573
+ trace: cuda,nvtx,cublas,ucx
574
+ cuda-memory-usage: 'true'
575
+ cuda-graph-trace: graph
576
+ worker_nsight_options:
577
+ trace: cuda,nvtx,cublas,ucx
578
+ cuda-memory-usage: 'true'
579
+ cuda-graph-trace: graph
580
+ capture-range: cudaProfilerApi
581
+ capture-range-end: null
582
+ kill: none
583
+ torch_memory:
584
+ trace_alloc_max_entries: 100000
585
+ stack_depth: 32
586
+ context: all
587
+ stacks: all
588
+ kw_args: {}
589
+ transfer_queue:
590
+ enable: false
591
+ ray_kwargs:
592
+ ray_init:
593
+ num_cpus: null
594
+ timeline_json_file: null
ICL/DAPO/verl-recipe/rep_exp/config/_generated_ppo_trainer.yaml ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This reference configration yaml is automatically generated via 'scripts/generate_trainer_config.sh'
2
+ # in which it invokes 'python3 scripts/print_cfg.py --cfg job ' to flatten the 'verl/trainer/config/ppo_trainer.yaml' config fields into a single file.
3
+ # Do not modify this file directly.
4
+ # The file is usually only for reference and never used.
5
+
6
+ actor_rollout_ref:
7
+ actor:
8
+ optim:
9
+ _target_: verl.workers.config.FSDPOptimizerConfig
10
+ optimizer: AdamW
11
+ optimizer_impl: torch.optim
12
+ lr: 1.0e-06
13
+ lr_warmup_steps_ratio: 0.0
14
+ total_training_steps: -1
15
+ weight_decay: 0.01
16
+ lr_warmup_steps: -1
17
+ betas:
18
+ - 0.9
19
+ - 0.999
20
+ clip_grad: 1.0
21
+ min_lr_ratio: 0.0
22
+ num_cycles: 0.5
23
+ lr_scheduler_type: constant
24
+ warmup_style: null
25
+ override_optimizer_config: null
26
+ fsdp_config:
27
+ _target_: verl.workers.config.FSDPEngineConfig
28
+ wrap_policy:
29
+ min_num_params: 0
30
+ param_offload: false
31
+ optimizer_offload: false
32
+ offload_policy: false
33
+ reshard_after_forward: true
34
+ fsdp_size: -1
35
+ forward_prefetch: false
36
+ model_dtype: fp32
37
+ use_orig_params: false
38
+ ulysses_sequence_parallel_size: 1
39
+ entropy_from_logits_with_chunking: false
40
+ use_torch_compile: true
41
+ entropy_checkpointing: false
42
+ forward_only: false
43
+ strategy: fsdp
44
+ dtype: bfloat16
45
+ _target_: verl.workers.config.FSDPActorConfig
46
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
47
+ strategy: fsdp
48
+ ppo_mini_batch_size: 256
49
+ ppo_micro_batch_size: null
50
+ ppo_micro_batch_size_per_gpu: null
51
+ use_dynamic_bsz: false
52
+ ppo_max_token_len_per_gpu: 16384
53
+ clip_ratio: 0.2
54
+ clip_ratio_low: 0.2
55
+ clip_ratio_high: 0.2
56
+ freeze_vision_tower: false
57
+ policy_loss:
58
+ _target_: verl.workers.config.PolicyLossConfig
59
+ loss_mode: vanilla
60
+ clip_cov_ratio: 0.0002
61
+ clip_cov_lb: 1.0
62
+ clip_cov_ub: 5.0
63
+ kl_cov_ratio: 0.0002
64
+ ppo_kl_coef: 0.1
65
+ clip_ratio_c: 3.0
66
+ loss_agg_mode: token-mean
67
+ entropy_coeff: 0
68
+ use_kl_loss: false
69
+ use_torch_compile: true
70
+ kl_loss_coef: 0.001
71
+ kl_loss_type: low_var_kl
72
+ ppo_epochs: 1
73
+ shuffle: false
74
+ checkpoint:
75
+ _target_: verl.trainer.config.CheckpointConfig
76
+ save_contents:
77
+ - model
78
+ - optimizer
79
+ - extra
80
+ load_contents: ${.save_contents}
81
+ async_save: false
82
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
83
+ profiler:
84
+ _target_: verl.utils.profiler.ProfilerConfig
85
+ tool: ${oc.select:global_profiler.tool,null}
86
+ enable: false
87
+ all_ranks: false
88
+ ranks: []
89
+ save_path: ${oc.select:global_profiler.save_path,null}
90
+ tool_config:
91
+ nsys:
92
+ _target_: verl.utils.profiler.config.NsightToolConfig
93
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
94
+ npu:
95
+ _target_: verl.utils.profiler.config.NPUToolConfig
96
+ contents: []
97
+ level: level1
98
+ analysis: true
99
+ discrete: false
100
+ torch:
101
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
102
+ step_start: 0
103
+ step_end: null
104
+ torch_memory:
105
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
106
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
107
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
108
+ grad_clip: 1.0
109
+ ulysses_sequence_parallel_size: 1
110
+ entropy_from_logits_with_chunking: false
111
+ entropy_checkpointing: false
112
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
113
+ ref:
114
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
115
+ strategy: ${actor_rollout_ref.actor.strategy}
116
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
117
+ log_prob_micro_batch_size: null
118
+ log_prob_micro_batch_size_per_gpu: null
119
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
120
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
121
+ profiler:
122
+ _target_: verl.utils.profiler.ProfilerConfig
123
+ tool: ${oc.select:global_profiler.tool,null}
124
+ enable: false
125
+ all_ranks: false
126
+ ranks: []
127
+ save_path: ${oc.select:global_profiler.save_path,null}
128
+ tool_config:
129
+ nsys:
130
+ _target_: verl.utils.profiler.config.NsightToolConfig
131
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
132
+ npu:
133
+ _target_: verl.utils.profiler.config.NPUToolConfig
134
+ contents: []
135
+ level: level1
136
+ analysis: true
137
+ discrete: false
138
+ torch:
139
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
140
+ step_start: 0
141
+ step_end: null
142
+ torch_memory:
143
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
144
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
145
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
146
+ fsdp_config:
147
+ _target_: verl.workers.config.FSDPEngineConfig
148
+ wrap_policy:
149
+ min_num_params: 0
150
+ param_offload: false
151
+ optimizer_offload: false
152
+ offload_policy: false
153
+ reshard_after_forward: true
154
+ fsdp_size: -1
155
+ forward_prefetch: false
156
+ model_dtype: fp32
157
+ use_orig_params: false
158
+ ulysses_sequence_parallel_size: 1
159
+ entropy_from_logits_with_chunking: false
160
+ use_torch_compile: true
161
+ entropy_checkpointing: false
162
+ forward_only: true
163
+ strategy: fsdp
164
+ dtype: bfloat16
165
+ _target_: verl.workers.config.FSDPActorConfig
166
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
167
+ entropy_from_logits_with_chunking: false
168
+ entropy_checkpointing: false
169
+ rollout:
170
+ _target_: verl.workers.config.RolloutConfig
171
+ name: ???
172
+ mode: async
173
+ temperature: 1.0
174
+ top_k: -1
175
+ top_p: 1
176
+ prompt_length: ${oc.select:data.max_prompt_length,512}
177
+ response_length: ${oc.select:data.max_response_length,512}
178
+ dtype: bfloat16
179
+ gpu_memory_utilization: 0.5
180
+ ignore_eos: false
181
+ enforce_eager: false
182
+ cudagraph_capture_sizes: null
183
+ free_cache_engine: true
184
+ tensor_model_parallel_size: 2
185
+ data_parallel_size: 1
186
+ expert_parallel_size: 1
187
+ pipeline_model_parallel_size: 1
188
+ max_num_batched_tokens: 8192
189
+ max_model_len: null
190
+ max_num_seqs: 1024
191
+ enable_chunked_prefill: true
192
+ enable_prefix_caching: true
193
+ load_format: dummy
194
+ log_prob_micro_batch_size: null
195
+ log_prob_micro_batch_size_per_gpu: null
196
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
197
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
198
+ disable_log_stats: true
199
+ do_sample: true
200
+ 'n': 1
201
+ over_sample_rate: 0
202
+ multi_stage_wake_up: false
203
+ engine_kwargs:
204
+ vllm: {}
205
+ sglang: {}
206
+ val_kwargs:
207
+ _target_: verl.workers.config.SamplingConfig
208
+ top_k: -1
209
+ top_p: 1.0
210
+ temperature: 0
211
+ 'n': 1
212
+ do_sample: false
213
+ multi_turn:
214
+ _target_: verl.workers.config.MultiTurnConfig
215
+ enable: false
216
+ max_assistant_turns: null
217
+ tool_config_path: null
218
+ max_user_turns: null
219
+ max_parallel_calls: 1
220
+ max_tool_response_length: 256
221
+ tool_response_truncate_side: middle
222
+ interaction_config_path: null
223
+ use_inference_chat_template: false
224
+ tokenization_sanity_check_mode: strict
225
+ format: hermes
226
+ num_repeat_rollouts: null
227
+ calculate_log_probs: false
228
+ agent:
229
+ _target_: verl.workers.config.AgentLoopConfig
230
+ num_workers: 8
231
+ default_agent_loop: single_turn_agent
232
+ agent_loop_config_path: null
233
+ custom_async_server:
234
+ _target_: verl.workers.config.CustomAsyncServerConfig
235
+ path: null
236
+ name: null
237
+ update_weights_bucket_megabytes: 512
238
+ trace:
239
+ _target_: verl.workers.config.TraceConfig
240
+ backend: null
241
+ token2text: false
242
+ max_samples_per_step_per_worker: null
243
+ skip_rollout: false
244
+ skip_dump_dir: /tmp/rollout_dump
245
+ skip_tokenizer_init: true
246
+ profiler:
247
+ _target_: verl.utils.profiler.ProfilerConfig
248
+ tool: ${oc.select:global_profiler.tool,null}
249
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
250
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
251
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
252
+ save_path: ${oc.select:global_profiler.save_path,null}
253
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
254
+ prometheus:
255
+ _target_: verl.workers.config.PrometheusConfig
256
+ enable: false
257
+ port: 9090
258
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
259
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
260
+ layered_summon: false
261
+ model:
262
+ _target_: verl.workers.config.HFModelConfig
263
+ path: ~/models/deepseek-llm-7b-chat
264
+ hf_config_path: null
265
+ tokenizer_path: null
266
+ use_shm: false
267
+ trust_remote_code: false
268
+ custom_chat_template: null
269
+ external_lib: null
270
+ override_config: {}
271
+ enable_gradient_checkpointing: true
272
+ enable_activation_offload: false
273
+ use_remove_padding: false
274
+ lora_rank: 0
275
+ lora_alpha: 16
276
+ target_modules: all-linear
277
+ exclude_modules: null
278
+ lora_adapter_path: null
279
+ use_liger: false
280
+ use_fused_kernels: false
281
+ fused_kernel_options:
282
+ impl_backend: torch
283
+ hybrid_engine: true
284
+ nccl_timeout: 600
285
+ data:
286
+ tokenizer: null
287
+ use_shm: false
288
+ train_files: ~/data/rlhf/gsm8k/train.parquet
289
+ val_files: ~/data/rlhf/gsm8k/test.parquet
290
+ train_max_samples: -1
291
+ val_max_samples: -1
292
+ prompt_key: prompt
293
+ reward_fn_key: data_source
294
+ max_prompt_length: 512
295
+ max_response_length: 512
296
+ train_batch_size: 1024
297
+ val_batch_size: null
298
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
299
+ null}
300
+ return_raw_input_ids: false
301
+ return_raw_chat: true
302
+ return_full_prompt: false
303
+ shuffle: true
304
+ seed: null
305
+ dataloader_num_workers: 8
306
+ image_patch_size: 14
307
+ validation_shuffle: false
308
+ filter_overlong_prompts: false
309
+ filter_overlong_prompts_workers: 1
310
+ truncation: error
311
+ image_key: images
312
+ video_key: videos
313
+ trust_remote_code: false
314
+ custom_cls:
315
+ path: null
316
+ name: null
317
+ return_multi_modal_inputs: true
318
+ sampler:
319
+ class_path: null
320
+ class_name: null
321
+ datagen:
322
+ path: null
323
+ name: null
324
+ apply_chat_template_kwargs: {}
325
+ critic:
326
+ optim:
327
+ _target_: verl.workers.config.FSDPOptimizerConfig
328
+ optimizer: AdamW
329
+ optimizer_impl: torch.optim
330
+ lr: 1.0e-05
331
+ lr_warmup_steps_ratio: 0.0
332
+ total_training_steps: -1
333
+ weight_decay: 0.01
334
+ lr_warmup_steps: -1
335
+ betas:
336
+ - 0.9
337
+ - 0.999
338
+ clip_grad: 1.0
339
+ min_lr_ratio: 0.0
340
+ num_cycles: 0.5
341
+ lr_scheduler_type: constant
342
+ warmup_style: null
343
+ override_optimizer_config: null
344
+ model:
345
+ fsdp_config:
346
+ _target_: verl.workers.config.FSDPEngineConfig
347
+ wrap_policy:
348
+ min_num_params: 0
349
+ param_offload: false
350
+ optimizer_offload: false
351
+ offload_policy: false
352
+ reshard_after_forward: true
353
+ fsdp_size: -1
354
+ forward_prefetch: false
355
+ model_dtype: fp32
356
+ use_orig_params: false
357
+ ulysses_sequence_parallel_size: 1
358
+ entropy_from_logits_with_chunking: false
359
+ use_torch_compile: true
360
+ entropy_checkpointing: false
361
+ forward_only: false
362
+ strategy: fsdp
363
+ dtype: bfloat16
364
+ path: ~/models/deepseek-llm-7b-chat
365
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
366
+ override_config: {}
367
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
368
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
369
+ _target_: verl.workers.config.FSDPCriticModelCfg
370
+ use_shm: false
371
+ enable_gradient_checkpointing: true
372
+ enable_activation_offload: false
373
+ use_remove_padding: false
374
+ lora_rank: 0
375
+ lora_alpha: 16
376
+ target_modules: all-linear
377
+ _target_: verl.workers.config.FSDPCriticConfig
378
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
379
+ strategy: fsdp
380
+ enable: null
381
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
382
+ ppo_micro_batch_size: null
383
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
384
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
385
+ ppo_max_token_len_per_gpu: 32768
386
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
387
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
388
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
389
+ cliprange_value: 0.5
390
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
391
+ checkpoint:
392
+ _target_: verl.trainer.config.CheckpointConfig
393
+ save_contents:
394
+ - model
395
+ - optimizer
396
+ - extra
397
+ load_contents: ${.save_contents}
398
+ async_save: false
399
+ profiler:
400
+ _target_: verl.utils.profiler.ProfilerConfig
401
+ tool: ${oc.select:global_profiler.tool,null}
402
+ enable: false
403
+ all_ranks: false
404
+ ranks: []
405
+ save_path: ${oc.select:global_profiler.save_path,null}
406
+ tool_config:
407
+ nsys:
408
+ _target_: verl.utils.profiler.config.NsightToolConfig
409
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
410
+ npu:
411
+ _target_: verl.utils.profiler.config.NPUToolConfig
412
+ contents: []
413
+ level: level1
414
+ analysis: true
415
+ discrete: false
416
+ torch:
417
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
418
+ step_start: 0
419
+ step_end: null
420
+ torch_memory:
421
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
422
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
423
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
424
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
425
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
426
+ ulysses_sequence_parallel_size: 1
427
+ grad_clip: 1.0
428
+ reward_model:
429
+ enable: false
430
+ enable_resource_pool: false
431
+ n_gpus_per_node: 0
432
+ nnodes: 0
433
+ strategy: fsdp
434
+ model:
435
+ input_tokenizer: ${actor_rollout_ref.model.path}
436
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
437
+ external_lib: ${actor_rollout_ref.model.external_lib}
438
+ trust_remote_code: false
439
+ use_shm: false
440
+ use_remove_padding: false
441
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
442
+ fsdp_config:
443
+ _target_: verl.workers.config.FSDPEngineConfig
444
+ wrap_policy:
445
+ min_num_params: 0
446
+ param_offload: false
447
+ reshard_after_forward: true
448
+ fsdp_size: -1
449
+ forward_prefetch: false
450
+ micro_batch_size: null
451
+ micro_batch_size_per_gpu: null
452
+ max_length: null
453
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
454
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
455
+ reward_manager: naive
456
+ launch_reward_fn_async: false
457
+ sandbox_fusion:
458
+ url: null
459
+ max_concurrent: 64
460
+ memory_limit_mb: 1024
461
+ profiler:
462
+ _target_: verl.utils.profiler.ProfilerConfig
463
+ tool: ${oc.select:global_profiler.tool,null}
464
+ enable: false
465
+ all_ranks: false
466
+ ranks: []
467
+ save_path: ${oc.select:global_profiler.save_path,null}
468
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
469
+ ulysses_sequence_parallel_size: 1
470
+ algorithm:
471
+ rollout_correction:
472
+ rollout_is: null
473
+ rollout_is_threshold: 2.0
474
+ rollout_rs: null
475
+ rollout_rs_threshold: null
476
+ rollout_rs_threshold_lower: null
477
+ rollout_token_veto_threshold: null
478
+ bypass_mode: false
479
+ use_policy_gradient: false
480
+ rollout_is_batch_normalize: false
481
+ _target_: verl.trainer.config.AlgoConfig
482
+ gamma: 1.0
483
+ lam: 1.0
484
+ adv_estimator: gae
485
+ norm_adv_by_std_in_grpo: true
486
+ use_kl_in_reward: false
487
+ kl_penalty: kl
488
+ kl_ctrl:
489
+ _target_: verl.trainer.config.KLControlConfig
490
+ type: fixed
491
+ kl_coef: 0.001
492
+ horizon: 10000
493
+ target_kl: 0.1
494
+ use_pf_ppo: false
495
+ pf_ppo:
496
+ reweight_method: pow
497
+ weight_pow: 2.0
498
+ custom_reward_function:
499
+ path: null
500
+ name: compute_score
501
+ trainer:
502
+ balance_batch: true
503
+ total_epochs: 30
504
+ total_training_steps: null
505
+ project_name: verl_examples
506
+ experiment_name: gsm8k
507
+ logger:
508
+ - console
509
+ - wandb
510
+ log_val_generations: 0
511
+ rollout_data_dir: null
512
+ validation_data_dir: null
513
+ nnodes: 1
514
+ n_gpus_per_node: 8
515
+ save_freq: -1
516
+ esi_redundant_time: 0
517
+ resume_mode: auto
518
+ resume_from_path: null
519
+ val_before_train: true
520
+ val_only: false
521
+ test_freq: -1
522
+ critic_warmup: 0
523
+ default_hdfs_dir: null
524
+ del_local_ckpt_after_load: false
525
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
526
+ max_actor_ckpt_to_keep: null
527
+ max_critic_ckpt_to_keep: null
528
+ ray_wait_register_center_timeout: 300
529
+ device: cuda
530
+ use_legacy_worker_impl: auto
531
+ global_profiler:
532
+ _target_: verl.utils.profiler.ProfilerConfig
533
+ tool: null
534
+ steps: null
535
+ profile_continuous_steps: false
536
+ save_path: outputs/profile
537
+ global_tool_config:
538
+ nsys:
539
+ _target_: verl.utils.profiler.config.NsightToolConfig
540
+ discrete: false
541
+ controller_nsight_options:
542
+ trace: cuda,nvtx,cublas,ucx
543
+ cuda-memory-usage: 'true'
544
+ cuda-graph-trace: graph
545
+ worker_nsight_options:
546
+ trace: cuda,nvtx,cublas,ucx
547
+ cuda-memory-usage: 'true'
548
+ cuda-graph-trace: graph
549
+ capture-range: cudaProfilerApi
550
+ capture-range-end: null
551
+ kill: none
552
+ torch_memory:
553
+ trace_alloc_max_entries: 100000
554
+ stack_depth: 32
555
+ context: all
556
+ stacks: all
557
+ kw_args: {}
558
+ transfer_queue:
559
+ enable: false
560
+ ray_kwargs:
561
+ ray_init:
562
+ num_cpus: null
563
+ timeline_json_file: null
ICL/DAPO/verl-recipe/rep_exp/config/actor/actor.yaml ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Format checks enforced on CI:
2
+ # 1. Comments must appear above each field.
3
+ # 2. There must be a blank line between each field.
4
+ # 3. Inline comments (after a field on the same line) are not allowed.
5
+ # 4. Indentation level is respected for nested fields.
6
+
7
+ # Target class for this configuration
8
+ _target_: verl.workers.config.ActorConfig
9
+
10
+ # Number of rollouts per update (mirrors actor rollout_n)
11
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
12
+
13
+ # the abstract actor configs
14
+ # fsdp, fsdp2 or megatron. must be set.
15
+ strategy: ???
16
+
17
+ # Split each sample into sub-batches of this size for PPO
18
+ ppo_mini_batch_size: 256
19
+
20
+ # [Deprecated] Global micro batch size
21
+ ppo_micro_batch_size: null
22
+
23
+ # Local per-GPU micro batch size
24
+ ppo_micro_batch_size_per_gpu: null
25
+
26
+ # Whether to automatically adjust batch size at runtime
27
+ # oc.select: the default val for ref.log_prob_use_dynamic_bsz
28
+ use_dynamic_bsz: false
29
+
30
+ # Max tokens per GPU in one PPO batch; affects gradient accumulation
31
+ # Typically it should be: n * ${data.max_prompt_length} + ${data.max_response_length}
32
+ # oc.select: the default val for ref.log_prob_max_token_len_per_gpu
33
+ ppo_max_token_len_per_gpu: 16384
34
+
35
+ # PPO clip ratio
36
+ clip_ratio: 0.2
37
+
38
+ # Lower bound for asymmetric clipping (used in dual-clip PPO)
39
+ clip_ratio_low: 0.2
40
+
41
+ # Upper bound for asymmetric clipping (used in dual-clip PPO)
42
+ clip_ratio_high: 0.2
43
+
44
+ # Whether to freeze vision model, if set true, it will be freeze vision model
45
+ freeze_vision_tower: false
46
+
47
+ # policy loss config
48
+ policy_loss:
49
+
50
+ # # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
51
+ _target_: verl.workers.config.PolicyLossConfig
52
+
53
+ # Loss function mode: vanilla / clip-cov / kl-cov /gpg from https://arxiv.org/abs/2505.22617
54
+ loss_mode: "vanilla"
55
+
56
+ # Ratio of tokens to be clipped for clip-cov loss
57
+ clip_cov_ratio: 0.0002
58
+
59
+ # Lower bound for clip-cov loss
60
+ clip_cov_lb: 1.0
61
+
62
+ # Upper bound for clip-cov loss
63
+ clip_cov_ub: 5.0
64
+
65
+ # Ratio of tokens to be applied kl penalty for kl-cov loss
66
+ kl_cov_ratio: 0.0002
67
+
68
+ # KL divergence penalty coefficient
69
+ ppo_kl_coef: 0.1
70
+
71
+ # Constant C in Dual-clip PPO; clips when advantage < 0 and ratio > C
72
+ clip_ratio_c: 3.0
73
+
74
+ # Loss aggregation mode: "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean"
75
+ loss_agg_mode: token-mean
76
+
77
+ # Entropy regularization coefficient in PPO loss
78
+ entropy_coeff: 0
79
+
80
+ # Whether to use KL loss instead of KL reward penalty. True for GRPO
81
+ use_kl_loss: false
82
+
83
+ # Whether to use torch.compile()
84
+ # oc.select: the default val for ref.use_torch_compile
85
+ use_torch_compile: true
86
+
87
+ # KL loss coefficient when use_kl_loss is enabled. For GRPO
88
+ kl_loss_coef: 0.001
89
+
90
+ # Type of KL divergence loss. Options: "kl"(k1), "abs", "mse"(k2), "low_var_kl"(k3), "full"
91
+ kl_loss_type: low_var_kl
92
+
93
+ # Number of PPO epochs per batch
94
+ ppo_epochs: 1
95
+
96
+ # Shuffle training data across PPO epochs
97
+ shuffle: false
98
+
99
+ # checkpoint configs
100
+ checkpoint:
101
+
102
+ # Target dataclass for this configuration
103
+ _target_: verl.trainer.config.CheckpointConfig
104
+
105
+ # What to include in saved checkpoints
106
+ # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
107
+ save_contents: ['model', 'optimizer', 'extra']
108
+
109
+ # For more flexibility, you can specify the contents to load from the checkpoint.
110
+ # .xxx refers to the local variable xxx from the same level of hierarchy similar to python pkg
111
+ load_contents: ${.save_contents}
112
+
113
+ # Whether to save checkpoints asynchronously. Only effective for Megatron as of now.
114
+ async_save: False
115
+
116
+ # optimizer configs
117
+ optim:
118
+
119
+ # Learning rate
120
+ lr: 1e-6
121
+
122
+ # Warmup steps ratio (used if lr_warmup_steps is 0 or negative)
123
+ lr_warmup_steps_ratio: 0.0
124
+
125
+ # Total training steps (must be overridden at runtime)
126
+ total_training_steps: -1
127
+
128
+ # Weight decay
129
+ weight_decay: 0.01
130
+
131
+ # Prioritized. None, 0 or Negative values mean delegating to lr_warmup_steps_ratio.
132
+ lr_warmup_steps: -1
133
+
134
+
135
+ # Whether to use custom fused kernels (e.g., FlashAttention, fused MLP)
136
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
137
+
138
+ # profile the actor model in `update_policy`
139
+ profiler:
140
+
141
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
142
+ _target_: verl.utils.profiler.ProfilerConfig
143
+
144
+ # profiler tool, default same as profiler.tool in global config
145
+ # choices: nsys, npu, torch
146
+ tool: ${oc.select:global_profiler.tool,null}
147
+
148
+ # whether enable profile on Actor
149
+ enable: False
150
+
151
+ # Whether to profile all ranks.
152
+ all_ranks: False
153
+
154
+ # The ranks that will be profiled. [] or [0,1,...]
155
+ ranks: []
156
+
157
+ # profile results saving path
158
+ save_path: ${oc.select:global_profiler.save_path,null}
159
+
160
+ # specific tool config which only related to the role
161
+ tool_config:
162
+
163
+ # nsys tool config
164
+ nsys:
165
+
166
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
167
+ _target_: verl.utils.profiler.config.NsightToolConfig
168
+
169
+ # True for each task has its own database, False for all tasks in one training step share one database.
170
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
171
+
172
+ # npu config
173
+ npu:
174
+
175
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
176
+ _target_: verl.utils.profiler.config.NPUToolConfig
177
+
178
+ # Contents to profile, can be empty
179
+ # options: npu, cpu, memory, shapes, module, stack
180
+ contents: []
181
+
182
+ # Collection level, optional values: level_none, level0, level1, level2.
183
+ level: "level1"
184
+
185
+ # Whether to automatically parse the data.
186
+ analysis: True
187
+
188
+ # True for each task has its own database, False for all tasks in one training step share one database.
189
+ discrete: False
190
+
191
+ # torch profiler config
192
+ torch:
193
+
194
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
195
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
196
+
197
+ # start profile mini-batch in training
198
+ # NOTICE: different with global steps config which refers to iteration
199
+ # This field only related with mini-batch
200
+ step_start: 0
201
+
202
+ # stop profile mini-batch in training
203
+ step_end: null
204
+
205
+ # torch memory profiler config
206
+ torch_memory:
207
+
208
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
209
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
210
+
211
+ # Maximum number of memory allocation entries to track
212
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
213
+
214
+ # Stack trace depth for memory allocations
215
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
ICL/DAPO/verl-recipe/rep_exp/config/actor/dp_actor.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Format checks enforced on CI:
2
+ # 1. Comments must appear above each field.
3
+ # 2. There must be a blank line between each field.
4
+ # 3. Inline comments (after a field on the same line) are not allowed.
5
+ # 4. Indentation level is respected for nested fields.
6
+
7
+ # defaults specify the default config from each component
8
+ defaults:
9
+
10
+ # fsdp optimizer config
11
+ - ../optim@optim: fsdp
12
+
13
+ # fsdp engine config
14
+ - ../engine@fsdp_config: fsdp
15
+
16
+ # dp actor config, inheriting from trainer/config/actor/actor.yaml
17
+ - actor
18
+
19
+ # load the reference default config, then apply the fields in the current yaml
20
+ - _self_
21
+
22
+ # Target class for this configuration
23
+ _target_: verl.workers.config.FSDPActorConfig
24
+
25
+ # TODO(haibin.lin): switch to fsdp2
26
+ strategy: fsdp
27
+
28
+ # Gradient clipping for actor updates, specific to the strategy.
29
+ grad_clip: 1.0
30
+
31
+ # Sequence parallelism size for Ulysses-style model parallelism
32
+ # oc.select: the default val for ref.ulysses_sequence_parallel_size
33
+ # [DEPRECATED] use fsdp_config.ulysses_sequence_parallel_size instead
34
+ ulysses_sequence_parallel_size: 1
35
+
36
+ # calculate entropy with chunking to reduce memory peak
37
+ entropy_from_logits_with_chunking: False
38
+
39
+ # recompute entropy
40
+ entropy_checkpointing: False
41
+
42
+ # Whether to remove padding tokens in inputs during training
43
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
ICL/DAPO/verl-recipe/rep_exp/config/actor/megatron_actor.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # megatron actor config, inheriting from trainer/config/actor/actor.yaml
2
+ defaults:
3
+ # megatron optimizer config
4
+ - ../optim@optim: megatron
5
+
6
+ # megatron engine config
7
+ - ../engine@megatron: megatron
8
+
9
+ - actor
10
+
11
+ # load the reference default config, then apply the fields in the current yaml
12
+ - _self_
13
+
14
+ _target_: verl.workers.config.McoreActorConfig
15
+
16
+ strategy: megatron
17
+
18
+ data_loader_seed: 42
19
+
20
+ load_weight: True
ICL/DAPO/verl-recipe/rep_exp/config/algorithm/rollout_correction.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Rollout Correction: corrects off-policy distribution shifts
2
+ # See documentation: docs/algo/rollout_corr.md
3
+ # Use presets: RolloutCorrectionConfig.decoupled_seq_is(), .pg_is(), etc.
4
+
5
+ # IS aggregation level: null (disabled), "token" (per-token), "sequence" (per-sequence)
6
+ rollout_is: null
7
+
8
+ # Upper threshold for IS weight truncation (typical: 2.0-5.0)
9
+ rollout_is_threshold: 2.0
10
+
11
+ # RS aggregation level: null (disabled), "token", "sequence", "geometric"
12
+ rollout_rs: null
13
+
14
+ # Upper threshold for rejection sampling (null = use rollout_is_threshold)
15
+ rollout_rs_threshold: null
16
+
17
+ # Lower threshold for rejection sampling (null = auto-compute as 1/upper)
18
+ rollout_rs_threshold_lower: null
19
+
20
+ # Per-token veto threshold for catastrophic outliers (null = disabled)
21
+ rollout_token_veto_threshold: null
22
+
23
+ # Operating mode: false = Decoupled (3 policies), true = Bypass (2 policies)
24
+ bypass_mode: false
25
+
26
+ # Loss function: false = PPO with clipping, true = Policy gradient (no clipping)
27
+ use_policy_gradient: false
28
+
29
+ # Batch normalize IS weights: false = raw weights, true = normalize to mean=1.0
30
+ rollout_is_batch_normalize: false
ICL/DAPO/verl-recipe/rep_exp/config/critic/critic.yaml ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
2
+ _target_: verl.workers.config.CriticConfig
3
+
4
+ # Number of rollouts per update (mirrors actor rollout_n)
5
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
6
+
7
+ # fsdp or fsdp2 strategy used for critic model training
8
+ strategy: ???
9
+
10
+ # whether to enable the critic worker.
11
+ # by default it is only enabled if advantage estimator is gae
12
+ # set it to True manually if you always want to enable critic worker
13
+ enable: null
14
+
15
+ # optimizer configs
16
+ optim:
17
+
18
+ # Learning rate
19
+ lr: 1e-5
20
+
21
+ # Warmup steps ratio; total steps will be injected at runtime
22
+ lr_warmup_steps_ratio: 0.0
23
+
24
+ # Total training steps (must be overridden at runtime)
25
+ total_training_steps: -1
26
+
27
+ # Weight decay
28
+ weight_decay: 0.01
29
+
30
+ # Takes priority over lr_warmup_steps_ratio; None, 0, or negative values delegate to lr_warmup_steps_ratio.
31
+ lr_warmup_steps: -1
32
+
33
+
34
+ # model config for the critic
35
+ model:
36
+
37
+ # Path to pretrained model weights
38
+ path: ~/models/deepseek-llm-7b-chat
39
+
40
+ # Tokenizer path (defaults to actor's model path)
41
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
42
+
43
+ # Hugging Face config override
44
+ override_config: {}
45
+
46
+ # External model implementation (optional)
47
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
48
+
49
+ # Whether to trust remote code from Hugging Face models
50
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
51
+
52
+ # PPO mini-batch size per update
53
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
54
+
55
+ # [Deprecated] Global micro batch size
56
+ ppo_micro_batch_size: null
57
+
58
+ # Local per-GPU micro batch size
59
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
60
+
61
+ # Whether to automatically adjust batch size at runtime
62
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
63
+
64
+ # Max tokens per GPU in one PPO batch (doubled for critic)
65
+ ppo_max_token_len_per_gpu: 32768
66
+
67
+ # Max token length per GPU in forward pass
68
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
69
+
70
+ # Number of PPO epochs per batch
71
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
72
+
73
+ # Shuffle training data across PPO epochs
74
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
75
+
76
+ # PPO value function clipping range
77
+ cliprange_value: 0.5
78
+
79
+ # Loss aggregation mode: "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean"
80
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
81
+
82
+ # checkpoint configs
83
+ checkpoint:
84
+
85
+ # Target dataclass for this configuration
86
+ _target_: verl.trainer.config.CheckpointConfig
87
+
88
+ # What to include in saved checkpoints
89
+ # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
90
+ save_contents: ['model', 'optimizer', 'extra']
91
+
92
+ # What to include when loading checkpoints
93
+ load_contents: ${.save_contents}
94
+
95
+ # Whether to save checkpoints asynchronously. Only effective for Megatron as of now.
96
+ async_save: False
97
+
98
+ # profile the critic model in `update_critic`
99
+ profiler:
100
+
101
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
102
+ _target_: verl.utils.profiler.ProfilerConfig
103
+
104
+ # profiler tool, default same as profiler.tool in global config
105
+ # choices: nsys, npu, torch, torch_memory
106
+ tool: ${oc.select:global_profiler.tool,null}
107
+
108
+ # whether to enable profiling on the Critic
109
+ enable: False
110
+
111
+ # Whether to profile all ranks.
112
+ all_ranks: False
113
+
114
+ # The ranks that will be profiled. [] or [0,1,...]
115
+ ranks: []
116
+
117
+ # profile results saving path
118
+ save_path: ${oc.select:global_profiler.save_path,null}
119
+
120
+ # specific tool config which only related to the role
121
+ tool_config:
122
+
123
+ # nsys tool config
124
+ nsys:
125
+
126
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
127
+ _target_: verl.utils.profiler.config.NsightToolConfig
128
+
129
+ # True for each task has its own database, False for all tasks in one training step share one database.
130
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
131
+
132
+ # npu config
133
+ npu:
134
+
135
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
136
+ _target_: verl.utils.profiler.config.NPUToolConfig
137
+
138
+ # Contents to profile, can be empty
139
+ # options: npu, cpu, memory, shapes, module, stack
140
+ contents: []
141
+
142
+ # Collection level, optional values: level_none, level0, level1, level2.
143
+ level: "level1"
144
+
145
+ # Whether to automatically parse the data.
146
+ analysis: True
147
+
148
+ # True for each task has its own database, False for all tasks in one training step share one database.
149
+ discrete: False
150
+
151
+ # torch profiler config
152
+ torch:
153
+
154
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
155
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
156
+
157
+ # start profile mini-batch in training
158
+ # NOTICE: different with global steps config which refers to iteration
159
+ # This field only related with mini-batch
160
+ step_start: 0
161
+
162
+ # stop profile mini-batch in training
163
+ step_end: null
164
+
165
+ # torch memory profiler config
166
+ torch_memory:
167
+
168
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
169
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
170
+
171
+ # Maximum number of memory allocation entries to track
172
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
173
+
174
+ # Stack trace depth for memory allocations
175
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
176
+
ICL/DAPO/verl-recipe/rep_exp/config/critic/dp_critic.yaml ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Format checks enforced on CI:
2
+ # 1. Comments must appear above each field.
3
+ # 2. There must be a blank line between each field.
4
+ # 3. Inline comments (after a field on the same line) are not allowed.
5
+ # 4. Indentation level is respected for nested fields.
6
+
7
+ # defaults specify the default config from each component
8
+ defaults:
9
+
10
+ # fsdp optimizer config
11
+ - ../optim@optim: fsdp
12
+
13
+ # fsdp engine config
14
+ - ../engine@model.fsdp_config: fsdp
15
+
16
+ # dp critic config, inheriting from trainer/config/critic/critic.yaml
17
+ - critic
18
+
19
+ # load the reference default config, then apply the fields in the current yaml
20
+ - _self_
21
+
22
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
23
+ _target_: verl.workers.config.FSDPCriticConfig
24
+
25
+ # distribution strategy. Options: fsdp (deprecating), fsdp2
26
+ strategy: fsdp
27
+
28
+ # model config for the critic
29
+ model:
30
+
31
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
32
+ _target_: verl.workers.config.FSDPCriticModelCfg
33
+
34
+ # Whether to use shared memory for loading the model
35
+ use_shm: False
36
+
37
+ # Enable gradient checkpointing to save memory
38
+ enable_gradient_checkpointing: True
39
+
40
+ # Offload activations to CPU to reduce GPU memory usage
41
+ enable_activation_offload: False
42
+
43
+ # Use remove padding optimization (saves compute)
44
+ use_remove_padding: False
45
+
46
+ # Set to positive value to enable LoRA (e.g., 32)
47
+ lora_rank: 0
48
+
49
+ # LoRA scaling factor
50
+ lora_alpha: 16
51
+
52
+ # LoRA target modules: "all-linear" or list of linear projection layers
53
+ target_modules: all-linear
54
+
55
+ # Forward-only batch size during inference (global)
56
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
57
+
58
+ # Forward-only batch size during inference (per GPU)
59
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
60
+
61
+ # Sequence parallelism size for Ulysses-style model parallelism
62
+ # [DEPRECATED] use fsdp_config.ulysses_sequence_parallel_size instead
63
+ ulysses_sequence_parallel_size: 1
64
+
65
+ # Gradient clipping for critic updates
66
+ grad_clip: 1.0
ICL/DAPO/verl-recipe/rep_exp/config/critic/megatron_critic.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # defaults specify the default config from each component
2
+ defaults:
3
+
4
+ # megatron optimizer config
5
+ - ../optim@optim: megatron
6
+
7
+ # megatron engine config
8
+ - ../engine@megatron: megatron
9
+
10
+ # megatron critic config, inheriting from trainer/config/critic/critic.yaml
11
+ - critic
12
+
13
+ # load the reference default config, then apply the fields in the current yaml
14
+ - _self_
15
+
16
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
17
+ _target_: verl.workers.config.McoreCriticConfig
18
+
19
+ strategy: megatron
20
+
21
+ # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron
22
+ nccl_timeout: 600
23
+
24
+ # model config for the critic
25
+ model:
26
+
27
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
28
+ _target_: verl.trainer.config.BaseModelConfig
29
+
30
+ # override default empty mapping
31
+ override_config:
32
+
33
+ model_config: {}
34
+
35
+ moe_config:
36
+
37
+ freeze_moe_router: False
38
+
39
+ # Whether to load initial weights
40
+ load_weight: True
41
+
42
+ # seed for data loader
43
+ data_loader_seed: ${oc.select:actor_rollout_ref.actor.data_loader_seed,null}
ICL/DAPO/verl-recipe/rep_exp/config/data/legacy_data.yaml ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Tokenizer class or path. If null, it will be inferred from the model.
2
+ tokenizer: null
3
+
4
+ # Whether to use shared memory for data loading.
5
+ use_shm: False
6
+
7
+ # Training set parquet. Can be a list or a single file.
8
+ # The program will read all files into memory, so it can't be too large (< 100GB).
9
+ # The path can be either a local path or an HDFS path.
10
+ # For HDFS path, we provide utils to download it to DRAM and convert it to a local path.
11
+ train_files: ~/data/rlhf/gsm8k/train.parquet
12
+
13
+ # Validation parquet. Can be a list or a single file.
14
+ val_files: ~/data/rlhf/gsm8k/test.parquet
15
+
16
+ # Maximum number of training samples to be used.
17
+ # Set to -1 to use full dataset, otherwise, randomly
18
+ # select the specified number of samples from train dataset
19
+ train_max_samples: -1
20
+
21
+ # Maximum number of validation samples to be used.
22
+ # Set to -1 to use full dataset, otherwise, randomly
23
+ # select the specified number of samples from val dataset
24
+ val_max_samples: -1
25
+
26
+ # The field in the dataset where the prompt is located. Default is 'prompt'.
27
+ prompt_key: prompt
28
+
29
+ # The field used to select the reward function (if using different ones per example).
30
+ reward_fn_key: data_source
31
+
32
+ # Maximum prompt length. All prompts will be left-padded to this length.
33
+ # An error will be reported if the length is too long.
34
+ # oc.select: default val for rollout.prompt_length
35
+ max_prompt_length: 512
36
+
37
+ # Maximum response length. Rollout in RL algorithms (e.g. PPO) generates up to this length.
38
+ # oc.select: default val for rollout.response_length
39
+ max_response_length: 512
40
+
41
+ # Batch size sampled for one training iteration of different RL algorithms.
42
+ train_batch_size: 1024
43
+
44
+ # Batch size used during validation. Can be null.
45
+ val_batch_size: null
46
+
47
+ # use tool config to calculate true prompt length
48
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, null}
49
+
50
+ # Whether to return the original input_ids without adding chat template.
51
+ # This is used when the reward model's chat template differs from the policy.
52
+ # If using a model-based RM with different templates, this should be True.
53
+ return_raw_input_ids: False
54
+
55
+ # Whether to return the original chat (prompt) without applying chat template.
56
+ return_raw_chat: True
57
+
58
+ # Whether to return the full prompt with chat template.
59
+ return_full_prompt: False
60
+
61
+ # Whether to shuffle the data in the dataloader.
62
+ shuffle: True
63
+
64
+ # Seed to use when shuffling the data
65
+ seed: null
66
+
67
+ # num dataloader workers
68
+ dataloader_num_workers: 8
69
+
70
+ # image patch size
71
+ image_patch_size: 14
72
+
73
+ # Whether to shuffle the validation set.
74
+ validation_shuffle: False
75
+
76
+ # Whether to filter overlong prompts.
77
+ filter_overlong_prompts: False
78
+
79
+ # Number of workers for filtering overlong prompts.
80
+ # For large-scale datasets, filtering can be time-consuming.
81
+ # Use multiprocessing to speed up. Default is 1.
82
+ filter_overlong_prompts_workers: 1
83
+
84
+ # Truncate the input_ids or prompt if they exceed max_prompt_length.
85
+ # Options: 'error', 'left', 'right', 'middle'. Default is 'error'.
86
+ truncation: error
87
+
88
+ # The field in the multi-modal dataset where the image is located. Default is 'images'.
89
+ image_key: images
90
+
91
+ # The field in the multi-modal dataset where the video is located.
92
+ video_key: videos
93
+
94
+ # If the remote tokenizer has a Python file, this flag determines whether to allow using it.
95
+ trust_remote_code: False
96
+
97
+ # Optional: specify a custom dataset class path and name if overriding default loading behavior.
98
+ custom_cls:
99
+
100
+ # The path to the file containing your customized dataset class. If not specified, pre-implemented dataset will be used.
101
+ path: null
102
+
103
+ # The name of the dataset class within the specified file.
104
+ name: null
105
+
106
+ # Whether to return multi-modal inputs in the dataset. Set to False if rollout generates new multi-modal inputs.
107
+ return_multi_modal_inputs: True
108
+
109
+ # settings related to data sampler
110
+ sampler:
111
+
112
+ # the path to the module containing a curriculum class which implements the
113
+ # AbstractSampler interface
114
+ class_path: null
115
+
116
+ # the name of the curriculum class like `MySampler`
117
+ class_name: null
118
+
119
+ # Data generation configuration for augmenting the dataset.
120
+ datagen:
121
+
122
+ # The path to the file containing your customized data generation class.
123
+ # E.g. 'pkg://verl.experimental.dynamic_dataset.dynamicgen_dataset'
124
+ path: null
125
+
126
+ # The class name of the data generation class within the specified file.
127
+ # E.g. 'MockDataGenerator'
128
+ name: null
129
+
130
+ # Additional kwargs when calling tokenizer.apply_chat_template
131
+ apply_chat_template_kwargs: {}
ICL/DAPO/verl-recipe/rep_exp/config/engine/fsdp.yaml ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Target class for this configuration
2
+ _target_: verl.workers.config.FSDPEngineConfig
3
+
4
+ # policy for wrapping the model
5
+ wrap_policy:
6
+
7
+ # Minimum number of parameters to trigger wrapping a layer with FSDP
8
+ min_num_params: 0
9
+
10
+ # Whether to offload model parameters to CPU (trades speed for memory)
11
+ # Note that this differs from the offload_policy in FSDP
12
+ param_offload: false
13
+
14
+ # Whether to offload optimizer state to CPU
15
+ # Note that this differs from the offload_policy in FSDP
16
+ optimizer_offload: false
17
+
18
+ # Only for FSDP2: offload param/grad/optimizer during train
19
+ offload_policy: false
20
+
21
+ # Only for FSDP2: Reshard after forward pass to reduce memory footprint
22
+ reshard_after_forward: true
23
+
24
+ # Number of GPUs in each FSDP shard group; -1 means auto
25
+ fsdp_size: -1
26
+
27
+ # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
28
+ # before the current forward computation.
29
+ forward_prefetch: False
30
+
31
+ # model dtype of fsdp
32
+ model_dtype: fp32
33
+
34
+ # Whether to use original parameters in fsdp. Only available in fsdp1
35
+ use_orig_params: false
36
+
37
+ # ulysses sequence parallel size
38
+ ulysses_sequence_parallel_size: 1
39
+
40
+ # Whether to use entropy_from_logits_with_chunking in fsdp.
41
+ entropy_from_logits_with_chunking: false
42
+
43
+ # Whether to use torch compile in fsdp.
44
+ use_torch_compile: true
45
+
46
+ # Whether to use entropy checkpointing in fsdp.
47
+ entropy_checkpointing: false
48
+
49
+ # Whether to use forward only in fsdp.
50
+ forward_only: false
51
+
52
+ # fsdp or fsdp2
53
+ strategy: fsdp
54
+
55
+ # Mixed precision training param dtype. Options: "bfloat16", "float16"
56
+ dtype: bfloat16