neural-mesh / evaluation /math_eval /eval_math_nodes.sh

Upload TestTime-RLVR-v2 from Full-pipeline-relative_0827 branch

f50dc54 verified 27 days ago

7.2 kB

	#!/bin/bash
	# Driver for evaluating training checkpoints on math benchmarks, optionally
	# sharded across several nodes. This section enters the working directory,
	# sets environment variables and defines the defaults that command-line
	# options may override.

	# Every relative path used later (sh/..., ../EVAL) is resolved from eval/,
	# so stop immediately if the directory cannot be entered.
	cd eval || { echo "Cannot change directory to eval" >&2; exit 1; }

	export NCCL_DEBUG=warn
	# Trace each command as it runs.
	# NOTE(review): tracing also prints WANDB_API_KEY when it is passed on a
	# command line later in the script.
	set -x

	export WANDB_OFFICIAL=1
	# Placeholder; must be replaced with a real key before pushing to wandb.
	export WANDB_API_KEY=TO_BE_FILLED

	# Node topology; presumably ARNOLD_* are provided by the job scheduler.
	TOTAL_NODES=${ARNOLD_WORKER_NUM:-1} # Default to 1 if not set
	CURRENT_NODE=${ARNOLD_ID:-0} # Default to 0 if not set

	# Defaults for optional command-line settings.
	add_step_0=false
	temperature=0.0
	max_tokens=16000
	top_p=0.95
	benchmarks="aime24,aime25,amc23,math500,olympiadbench,minerva_math,livemathbench"
	output_dir="eval_results"
	overwrite=false
	n_sampling=1
	specific_steps=""
	#######################################
	# Parse "--option value" pairs into the script's global settings.
	# Globals (written): RUN_NAME, INIT_MODEL_PATH, template, tp_size,
	#   temperature, top_p, max_tokens, add_step_0, benchmarks, just_wandb,
	#   output_dir, overwrite, n_sampling, specific_steps, seed
	# Arguments: the command-line arguments of the script
	# Exits:     1 on an unknown option or an option given without a value
	#######################################
	parse_args() {
	  local opt var
	  while [[ $# -gt 0 ]]; do
	    opt=$1
	    case "$opt" in
	      --run_name) var=RUN_NAME ;;
	      --init_model) var=INIT_MODEL_PATH ;;
	      --template | --tp_size | --temperature | --top_p | --max_tokens | \
	        --add_step_0 | --benchmarks | --just_wandb | --output_dir | \
	        --overwrite | --n_sampling | --specific_steps | --seed)
	        # These options map onto a variable with the same name.
	        var=${opt#--}
	        ;;
	      *)
	        echo "Unknown parameter: $1" >&2
	        exit 1
	        ;;
	    esac

	    # Without this guard, "shift 2" on a single remaining argument fails
	    # without shifting and the loop never terminates.
	    if [[ $# -lt 2 ]]; then
	      echo "Missing value for parameter: $opt" >&2
	      exit 1
	    fi

	    printf -v "$var" '%s' "$2"
	    shift 2
	  done
	}

	parse_args "$@"

	# Check required parameters: abort with a usage hint when any of them is
	# missing or empty.
	if [[ -z "$RUN_NAME" || -z "$INIT_MODEL_PATH" || -z "$template" || -z "$tp_size" ]]; then
	  echo "Missing required parameters. Usage:"
	  echo "--run_name <run_name> --init_model <init_model> --template <template> --tp_size <tp_size>"
	  exit 1
	fi


	# Locations used by the rest of the script (relative to the current
	# working directory).
	HDFS_HOME=../EVAL
	eval_script_path="sh/eval.sh"

	# Checkpoints of a run live under <HDFS_HOME>/checkpoints/<run name>.
	base_checkpoint_path="${HDFS_HOME}/checkpoints/${RUN_NAME}"
	remove_dir="${base_checkpoint_path}/global_step_0/"

	# The value of --init_model is used verbatim as the model path.
	init_model_path="${INIT_MODEL_PATH}"

	# Disabled legacy logic that mapped model names to cluster-specific paths:
	# init_model_path="${HDFS_HOME}/base_models/${INIT_MODEL_PATH}"
	# if [ "${INIT_MODEL_PATH}" = "Qwen2.5-32B" ]; then
	# init_model_path="/cluster/nvme4b/chenzhiqi/rl/${INIT_MODEL_PATH}"
	# elif [ "${INIT_MODEL_PATH}" = "Qwen-2.5-32B-SimpleRL-Zoo" ]; then
	# init_model_path="/cluster/nvme4b/chenzhiqi/rl/${INIT_MODEL_PATH}"
	# # elif [ "${INIT_MODEL_PATH}" = "Qwen2.5-14B" ]; then
	# # init_model_path="/cluster/data7a/chenzhiqi/rl/${INIT_MODEL_PATH}"
	# elif [ "${INIT_MODEL_PATH}" = "Qwen-2.5-14B-SimpleRL-Zoo" ]; then
	# init_model_path="/cluster/data7a/chenzhiqi/rl/${INIT_MODEL_PATH}"
	# elif [ "${INIT_MODEL_PATH}" = "AZR" ]; then
	# init_model_path="/home/fit/huangg/WORK/zqc/reason_rl/converted/code_io_full_v20001/global_step_180"
	# # if "converted" in INIT_MODEL_PATH:
	# elif [[ "${INIT_MODEL_PATH}" == converted* ]]; then
	# # Handle paths starting with "converted"
	# init_model_path="/home/fit/huangg/WORK/zqc/reason_rl/${INIT_MODEL_PATH}"
	# else
	# init_model_path="/home/fit/huangg/WORK/zqc/models/${INIT_MODEL_PATH}"
	# fi

	# Make the conversion/evaluation helper executable.
	chmod +x sh/convert_and_evaluate_gpu_nodes.sh


	# When --add_step_0 is "true", expose the initial model as a synthetic
	# global_step_0 checkpoint. Node 0 performs the copy and drops a .cp_done
	# marker; every other node polls for that marker before continuing.
	if [ "${add_step_0:-false}" = true ]; then
	  done_file="$base_checkpoint_path/global_step_0/actor/huggingface/.cp_done"

	  if [ "$CURRENT_NODE" -eq 0 ]; then
	    # Copy only once: the marker means a previous run already finished it.
	    if [ ! -f "$done_file" ]; then
	      mkdir -p "$base_checkpoint_path/global_step_0/actor/huggingface"
	      if ! cp -r "$init_model_path"/* "$base_checkpoint_path/global_step_0/actor/huggingface/"; then
	        echo "Failed to copy initial model"
	        exit 1
	      fi
	      touch "$done_file"
	      echo "Copied initial model to $base_checkpoint_path/global_step_0/actor/huggingface/"
	    fi
	  else
	    # Non-zero nodes block here until node 0 has written the marker file.
	    echo "Node $CURRENT_NODE waiting for step 0 files to be copied..."
	    until [ -f "$done_file" ]; do
	      sleep 5
	    done
	    echo "Node $CURRENT_NODE detected step 0 files are ready"
	  fi
	fi


	#######################################
	# List checkpoint directory names (global_step_<N>) found under a base path.
	# Arguments:
	#   $1 - directory that contains the global_step_* checkpoint directories
	#   $2 - optional comma-separated list of step numbers; when non-empty only
	#        those steps are considered
	# Outputs:
	#   stdout - one checkpoint name per line in version-sort order; nothing at
	#            all when no checkpoint matches, so callers that read lines
	#            into an array get an empty array
	#   stderr - a warning for every requested step that does not exist
	#######################################
	get_all_checkpoints() {
	  local base_path="$1"
	  local specific_steps="$2"
	  local -a checkpoints=() step_array=()
	  local step step_dir ckpt_dir

	  if [ -n "$specific_steps" ]; then
	    # If specific steps are provided, only collect those checkpoints
	    IFS=',' read -r -a step_array <<< "$specific_steps"
	    for step in "${step_array[@]}"; do
	      step_dir="$base_path/global_step_$step"
	      if [ -d "$step_dir" ]; then
	        checkpoints+=("global_step_$step")
	      else
	        # Keep warnings off stdout so they are never mistaken for a
	        # checkpoint name by the caller.
	        echo "Warning: Requested step $step does not exist at $step_dir" >&2
	      fi
	    done
	  else
	    # Otherwise, collect all checkpoints; the -d test also discards the
	    # literal pattern left behind when the glob matches nothing.
	    for ckpt_dir in "$base_path"/global_step_*; do
	      if [ -d "$ckpt_dir" ]; then
	        checkpoints+=("${ckpt_dir##*/}")
	      fi
	    done
	  fi

	  # Sort the checkpoints to ensure consistent ordering across nodes
	  if [ "${#checkpoints[@]}" -gt 0 ]; then
	    printf '%s\n' "${checkpoints[@]}" | sort -V
	  fi
	}
	# Get all checkpoints. Only lines that look like checkpoint names are kept:
	# blank lines and any other text emitted by the helper are not checkpoints
	# and must not be counted or scheduled. Other non-empty lines are forwarded
	# to stderr so that diagnostics are not lost.
	all_checkpoints=()
	while IFS= read -r ckpt_line; do
	  if [[ "$ckpt_line" == global_step_* ]]; then
	    all_checkpoints+=("$ckpt_line")
	  elif [ -n "$ckpt_line" ]; then
	    printf '%s\n' "$ckpt_line" >&2
	  fi
	done < <(get_all_checkpoints "$base_checkpoint_path" "$specific_steps")
	total_ckpts=${#all_checkpoints[@]}

	if [ "$total_ckpts" -eq 0 ]; then
	  echo "No checkpoints found to evaluate."
	  exit 0
	fi

	echo "Total checkpoints: $total_ckpts"
	echo "Running on node $CURRENT_NODE of $TOTAL_NODES nodes"

	# Distribute checkpoints across nodes round-robin: checkpoint i goes to
	# node (i mod TOTAL_NODES).
	declare -a node_checkpoints=()
	for ((i = 0; i < total_ckpts; i++)); do
	  if [ $((i % TOTAL_NODES)) -eq "$CURRENT_NODE" ]; then
	    node_checkpoints+=("${all_checkpoints[i]}")
	  fi
	done
	echo "This node will evaluate ${#node_checkpoints[@]} checkpoints:"

	# Create a temporary file with the assigned checkpoints and remove it when
	# the script exits.
	tmp_ckpt_file=$(mktemp) || { echo "Failed to create temporary checkpoint list" >&2; exit 1; }
	trap 'rm -f -- "$tmp_ckpt_file"' EXIT

	# With an empty assignment, printf '%s\n' would still emit one blank line;
	# leave the file empty instead so no blank checkpoint name is written.
	if [ "${#node_checkpoints[@]}" -gt 0 ]; then
	  printf '%s\n' "${node_checkpoints[@]}"
	  printf '%s\n' "${node_checkpoints[@]}" > "$tmp_ckpt_file"
	fi

	# Unless --just_wandb true was given, run the conversion and evaluation
	# script on the checkpoints assigned to this node.
	if [ "$just_wandb" != "true" ]; then
	  # Pass the node id as data, not as part of the printf format string.
	  printf 'Evaluating checkpoints on node %s\n' "$CURRENT_NODE"
	  # A failure is reported but does not abort, so results already produced
	  # can still be collected afterwards.
	  if ! bash sh/convert_and_evaluate_gpu_nodes.sh \
	    "$eval_script_path" \
	    "$base_checkpoint_path" \
	    "$init_model_path" \
	    "$template" \
	    "$benchmarks" \
	    "$temperature" \
	    "$max_tokens" \
	    "$top_p" \
	    "$tp_size" \
	    "$tmp_ckpt_file" \
	    "$output_dir" \
	    "$overwrite" \
	    "$n_sampling" \
	    "$seed"; then
	    echo "Warning: sh/convert_and_evaluate_gpu_nodes.sh exited with an error on node $CURRENT_NODE" >&2
	  fi
	fi

	# Run the per-run result collector over this run's output directory.
	# Every expansion is quoted so that values containing spaces or glob
	# characters reach the Python script as single arguments.
	# NOTE(review): the wandb API key is passed on the command line and is
	# therefore visible in the `set -x` trace and in process listings.
	python sh/collect_results.py \
	  --base_dir "$base_checkpoint_path/$output_dir" \
	  --model_name "$init_model_path" \
	  --wandb_project "verl_math_evaluate" \
	  --wandb_api_key "${WANDB_API_KEY}" \
	  --wandb_run_name "$RUN_NAME" \
	  --temperature "$temperature" \
	  --benchmarks "$benchmarks" \
	  --use_wandb # whether to push to wandb

	# The final collector is run from the parent directory; do not run it
	# from the wrong place if the directory change fails.
	cd .. || { echo "Cannot change directory to parent directory" >&2; exit 1; }
	python collect_all_math_results.py