neural-mesh / evaluation /math_eval /eval_math_nodes.sh
hjkim00's picture
Upload TestTime-RLVR-v2 from Full-pipeline-relative_0827 branch
f50dc54 verified
#!/bin/bash
# Environment bootstrap for the multi-node math evaluation launcher.
# Every relative path used later in this script (sh/..., ../EVAL) is resolved
# from eval/, so stop immediately if that directory cannot be entered.
cd eval || {
  echo "Cannot enter the eval directory from $PWD" >&2
  exit 1
}
export NCCL_DEBUG=warn
# Trace each command as it executes.
# NOTE(review): tracing also prints WANDB_API_KEY wherever it gets expanded.
set -x
export WANDB_OFFICIAL=1
# Keep a key already provided by the caller's environment; fall back to the
# placeholder only when none is set.
export WANDB_API_KEY="${WANDB_API_KEY:-TO_BE_FILLED}"
# Multi-node layout, taken from the ARNOLD_* environment variables.
CURRENT_NODE="${ARNOLD_ID:-0}"        # this node's index; 0 when ARNOLD_ID is unset or empty
TOTAL_NODES="${ARNOLD_WORKER_NUM:-1}" # node count; 1 when ARNOLD_WORKER_NUM is unset or empty

# Defaults for the optional settings; the matching --option overrides each one.
benchmarks='aime24,aime25,amc23,math500,olympiadbench,minerva_math,livemathbench'
output_dir='eval_results'
specific_steps=''
add_step_0='false'
overwrite='false'

# Sampling defaults.
n_sampling='1'
temperature='0.0'
top_p='0.95'
max_tokens='16000'
#######################################
# Parse the launcher's "--name value" command-line options into globals.
# Globals written:
#   RUN_NAME (--run_name), INIT_MODEL_PATH (--init_model), and for every
#   other option the variable named like the option itself: template,
#   tp_size, temperature, top_p, max_tokens, add_step_0, benchmarks,
#   just_wandb, output_dir, overwrite, n_sampling, specific_steps, seed.
# Arguments:
#   The script's command-line arguments.
# Exits:
#   1 on an unknown option, or when an option is not followed by a value.
#######################################
parse_args() {
  local opt
  while (($# > 0)); do
    opt=$1
    # Accept only the known options.
    case "$opt" in
      --run_name | --init_model | --template | --tp_size | --temperature | \
        --top_p | --max_tokens | --add_step_0 | --benchmarks | --just_wandb | \
        --output_dir | --overwrite | --n_sampling | --specific_steps | --seed) ;;
      *)
        echo "Unknown parameter: $opt"
        exit 1
        ;;
    esac
    # Every option takes exactly one value. Without this check a trailing
    # option makes "shift 2" fail without shifting, and the loop never ends.
    if (($# < 2)); then
      echo "Missing value for parameter: $opt" >&2
      exit 1
    fi
    case "$opt" in
      --run_name) RUN_NAME="$2" ;;
      --init_model) INIT_MODEL_PATH="$2" ;;
      # Remaining options store into the variable that shares their name.
      *) printf -v "${opt#--}" '%s' "$2" ;;
    esac
    shift 2
  done
}
parse_args "$@"
# All arguments have been consumed into variables; clear the positional
# parameters.
set --
# The run name, initial model, prompt template and tensor-parallel size are
# mandatory: print the usage text and stop when any of them is empty.
if [[ -n "$RUN_NAME" && -n "$INIT_MODEL_PATH" && -n "$template" && -n "$tp_size" ]]; then
  : # every required parameter is present
else
  echo "Missing required parameters. Usage:"
  echo "--run_name <run_name> --init_model <init_model> --template <template> --tp_size <tp_size>"
  exit 1
fi
# Locations used by the rest of the script; relative paths are resolved from
# the current working directory.
HDFS_HOME='../EVAL'
eval_script_path='sh/eval.sh'
# Checkpoints of one run live under <HDFS_HOME>/checkpoints/<run name>.
base_checkpoint_path="${HDFS_HOME}/checkpoints/${RUN_NAME}"
remove_dir="${base_checkpoint_path}/global_step_0/"
# init_model_path="${HDFS_HOME}/base_models/${INIT_MODEL_PATH}"
# if [ "${INIT_MODEL_PATH}" = "Qwen2.5-32B" ]; then
# init_model_path="/cluster/nvme4b/chenzhiqi/rl/${INIT_MODEL_PATH}"
# elif [ "${INIT_MODEL_PATH}" = "Qwen-2.5-32B-SimpleRL-Zoo" ]; then
# init_model_path="/cluster/nvme4b/chenzhiqi/rl/${INIT_MODEL_PATH}"
# # elif [ "${INIT_MODEL_PATH}" = "Qwen2.5-14B" ]; then
# # init_model_path="/cluster/data7a/chenzhiqi/rl/${INIT_MODEL_PATH}"
# elif [ "${INIT_MODEL_PATH}" = "Qwen-2.5-14B-SimpleRL-Zoo" ]; then
# init_model_path="/cluster/data7a/chenzhiqi/rl/${INIT_MODEL_PATH}"
# elif [ "${INIT_MODEL_PATH}" = "AZR" ]; then
# init_model_path="/home/fit/huangg/WORK/zqc/reason_rl/converted/code_io_full_v20001/global_step_180"
# # if "converted" in INIT_MODEL_PATH:
# elif [[ "${INIT_MODEL_PATH}" == converted* ]]; then
# # Handle paths starting with "converted"
# init_model_path="/home/fit/huangg/WORK/zqc/reason_rl/${INIT_MODEL_PATH}"
# else
# init_model_path="/home/fit/huangg/WORK/zqc/models/${INIT_MODEL_PATH}"
# fi
# The model location is used exactly as given on the command line.
init_model_path="$INIT_MODEL_PATH"
chmod +x sh/convert_and_evaluate_gpu_nodes.sh

# Optional step-0 baseline: expose the initial model as the checkpoint
# global_step_0. Node 0 copies the files and then drops a ".cp_done" marker;
# every other node polls for that marker before continuing.
if [ "${add_step_0:-false}" = true ]; then
  step0_hf_dir="$base_checkpoint_path/global_step_0/actor/huggingface"
  done_file="$step0_hf_dir/.cp_done"
  if [ "$CURRENT_NODE" -eq 0 ]; then
    # Skip the copy when the marker already exists.
    if ! [ -f "$done_file" ]; then
      mkdir -p "$step0_hf_dir"
      if cp -r "$init_model_path"/* "$step0_hf_dir/"; then
        touch "$done_file"
        echo "Copied initial model to $step0_hf_dir/"
      else
        echo "Failed to copy initial model"
        exit 1
      fi
    fi
  else
    # NOTE(review): this wait has no timeout; if node 0 fails the copy the
    # marker never appears and this node keeps polling.
    echo "Node $CURRENT_NODE waiting for step 0 files to be copied..."
    until [ -f "$done_file" ]; do
      sleep 5
    done
    echo "Node $CURRENT_NODE detected step 0 files are ready"
  fi
fi
#######################################
# List the checkpoint directories found under a run's checkpoint root.
# Arguments:
#   $1 - directory that holds the global_step_* checkpoint directories
#   $2 - optional comma-separated step numbers; when non-empty only those
#        steps are considered, otherwise every global_step_* directory is
# Outputs:
#   stdout - one "global_step_<N>" name per line in version-sort order, or
#            nothing at all when no checkpoint directory matches
#   stderr - a warning for each requested step whose directory is missing
#######################################
get_all_checkpoints() {
  local base_path="$1"
  local specific_steps="$2"
  local checkpoints=()
  local -a step_array=()
  local step step_dir ckpt_dir

  if [ -n "$specific_steps" ]; then
    # Only the explicitly requested steps are collected.
    IFS=',' read -r -a step_array <<< "$specific_steps"
    for step in "${step_array[@]}"; do
      step_dir="$base_path/global_step_$step"
      if [ -d "$step_dir" ]; then
        checkpoints+=("global_step_$step")
      else
        # stdout carries the checkpoint list that callers capture, so the
        # warning has to go to stderr.
        echo "Warning: Requested step $step does not exist at $step_dir" >&2
      fi
    done
  else
    # An unmatched glob stays as the literal pattern; the -d test drops it.
    for ckpt_dir in "$base_path"/global_step_*; do
      if [ -d "$ckpt_dir" ]; then
        checkpoints+=("${ckpt_dir##*/}")
      fi
    done
  fi

  # Print nothing when the list is empty, so a caller reading the output
  # line by line ends up with zero entries rather than a single blank one.
  if [ "${#checkpoints[@]}" -gt 0 ]; then
    # Version sort keeps the ordering identical on every node.
    printf '%s\n' "${checkpoints[@]}" | sort -V
  fi
}
# Discover the checkpoints to evaluate. Blank lines in the listing are
# dropped so that an empty result really counts as zero checkpoints.
readarray -t listed_checkpoints < <(get_all_checkpoints "$base_checkpoint_path" "$specific_steps")
all_checkpoints=()
for ckpt in "${listed_checkpoints[@]}"; do
  if [ -n "$ckpt" ]; then
    all_checkpoints+=("$ckpt")
  fi
done
total_ckpts=${#all_checkpoints[@]}
if [ "$total_ckpts" -eq 0 ]; then
  echo "No checkpoints found to evaluate."
  exit 0
fi
echo "Total checkpoints: $total_ckpts"
echo "Running on node $CURRENT_NODE of $TOTAL_NODES nodes"

# Round-robin distribution: checkpoint i is handled by node (i mod TOTAL_NODES).
declare -a node_checkpoints=()
for ((i = 0; i < total_ckpts; i++)); do
  if [ $((i % TOTAL_NODES)) -eq "$CURRENT_NODE" ]; then
    node_checkpoints+=("${all_checkpoints[i]}")
  fi
done
echo "This node will evaluate ${#node_checkpoints[@]} checkpoints:"

# The assigned checkpoints are written one per line to a temporary file,
# which is removed again when the script exits.
tmp_ckpt_file=$(mktemp) || {
  echo "Failed to create the temporary checkpoint list file" >&2
  exit 1
}
trap 'rm -f -- "$tmp_ckpt_file"' EXIT
# With no assignment the file stays empty; printf with zero arguments would
# otherwise emit one blank line that reads back as an empty checkpoint name.
if [ "${#node_checkpoints[@]}" -gt 0 ]; then
  printf '%s\n' "${node_checkpoints[@]}"
  printf '%s\n' "${node_checkpoints[@]}" > "$tmp_ckpt_file"
fi
# Unless only the result upload was requested (--just_wandb true), run the
# conversion and evaluation script on the checkpoints listed in
# $tmp_ckpt_file. Arguments are positional, so their order matters.
if [ "$just_wandb" != "true" ]; then
  # The node id is passed as data, never as part of the printf format.
  printf 'Evaluating checkpoints on node %s\n' "$CURRENT_NODE"
  bash sh/convert_and_evaluate_gpu_nodes.sh \
    "$eval_script_path" \
    "$base_checkpoint_path" \
    "$init_model_path" \
    "$template" \
    "$benchmarks" \
    "$temperature" \
    "$max_tokens" \
    "$top_p" \
    "$tp_size" \
    "$tmp_ckpt_file" \
    "$output_dir" \
    "$overwrite" \
    "$n_sampling" \
    "$seed"
fi

# Collect the results for this run. Every expansion is quoted so that values
# containing spaces or glob characters arrive as single arguments.
# NOTE(review): the W&B key is passed on the command line, so it is visible
# in the xtrace output and in process listings.
python sh/collect_results.py \
  --base_dir "$base_checkpoint_path/$output_dir" \
  --model_name "$init_model_path" \
  --wandb_project "verl_math_evaluate" \
  --wandb_api_key "${WANDB_API_KEY}" \
  --wandb_run_name "$RUN_NAME" \
  --temperature "$temperature" \
  --benchmarks "$benchmarks" \
  --use_wandb # whether to push to wandb

# collect_all_math_results.py is run from the parent directory.
cd .. || {
  echo "Cannot change to the parent directory of $PWD" >&2
  exit 1
}
python collect_all_math_results.py