#!/bin/bash
# TestTime RLVR Batch Evaluation Script
# Runs evaluation over an entire benchmark.

# Print usage information
print_usage() {
echo "์‚ฌ์šฉ๋ฒ•: $0 [MODEL] [BENCHMARK] [MAX_PROBLEMS] [GPU_ID] [RESUME_OPTIONS]"
echo ""
echo "๋งค๊ฐœ๋ณ€์ˆ˜:"
echo " MODEL ๋ชจ๋ธ ์ด๋ฆ„ (๊ธฐ๋ณธ๊ฐ’: Qwen/Qwen2.5-7B)"
echo " BENCHMARK ๋ฒค์น˜๋งˆํฌ (mbpp|humaneval|all, ๊ธฐ๋ณธ๊ฐ’: mbpp)"
echo " MAX_PROBLEMS ์ตœ๋Œ€ ๋ฌธ์ œ ์ˆ˜ (0=์ „์ฒด, ๊ธฐ๋ณธ๊ฐ’: 10)"
echo " GPU_ID GPU ๋ฒˆํ˜ธ (๊ธฐ๋ณธ๊ฐ’: 6)"
echo ""
echo "Resume ์˜ต์…˜:"
echo " --resume ์ด์ „์— ์™„๋ฃŒ๋œ ๋ฌธ์ œ๋“ค์„ ์ œ์™ธํ•˜๊ณ  ์ด์–ด์„œ ์‹คํ–‰"
echo " --start-from PROBLEM ํŠน์ • ๋ฌธ์ œ ID๋ถ€ํ„ฐ ์‹œ์ž‘ (์˜ˆ: Mbpp/100)"
echo ""
echo "์˜ˆ์‹œ:"
echo " $0 # ๊ธฐ๋ณธ ์„ค์ • (MBPP 10๋ฌธ์ œ)"
echo " $0 \"Qwen/Qwen2.5-7B\" mbpp 0 6 # MBPP ์ „์ฒด"
echo " $0 \"Qwen/Qwen2.5-7B\" all 0 6 --resume # ๋ชจ๋“  ๋ฒค์น˜๋งˆํฌ ์ „์ฒด (์ด์–ด์„œ)"
echo " $0 \"Qwen/Qwen2.5-7B\" mbpp 0 6 --start-from Mbpp/100 # Mbpp/100๋ถ€ํ„ฐ ์‹œ์ž‘"
}
# ๋„์›€๋ง ํ™•์ธ
if [[ "$1" == "--help" ]] || [[ "$1" == "-h" ]]; then
print_usage
exit 0
fi
# GPU selection (set dynamically by the script)
# export CUDA_VISIBLE_DEVICES=6
# Add the EvalPlus path
export PYTHONPATH="/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/coding:$PYTHONPATH"
# HuggingFace cache location (for faster model loading)
export HF_HOME="/data/.cache/huggingface"
export TRANSFORMERS_CACHE="/data/.cache/huggingface"
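# Note: recent transformers releases read the cache location from HF_HOME and warn that
# TRANSFORMERS_CACHE is deprecated; the second export is kept for older installs.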
# Python path for EvalPlus imports
# Add the conda environment's site-packages directory
export PYTHONPATH="/data/miniforge3/envs/azr/lib/python3.10/site-packages:$PYTHONPATH"
# Silence tokenizer parallelism warnings
export TOKENIZERS_PARALLELISM=false
echo "๐Ÿš€ TestTime RLVR Batch Evaluation"
echo "=================================="
# Default settings
MODEL=${1:-"Qwen/Qwen2.5-7B"}
BENCHMARK=${2:-"mbpp"}
MAX_PROBLEMS=${3:-10}
GPU_ID=${4:-6}
# Resume option handling
RESUME_OPTIONS=""
START_FROM=""
# Parse resume options starting from the 5th argument
shift $(( $# < 4 ? $# : 4 ))  # drop up to the first four positional parameters (a plain `shift 4` fails when fewer are given)
while [[ $# -gt 0 ]]; do
case $1 in
--resume)
RESUME_OPTIONS="--resume"
shift
;;
--start-from)
if [[ -n $2 ]]; then
START_FROM="--start_from $2"
shift 2
else
echo "โŒ Error: --start-from requires a problem ID"
print_usage
exit 1
fi
;;
*)
echo "โŒ Error: Unknown option $1"
print_usage
exit 1
;;
esac
done
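# Example: running
#   $0 "Qwen/Qwen2.5-7B" mbpp 0 6 --resume --start-from Mbpp/100
# leaves RESUME_OPTIONS="--resume" and START_FROM="--start_from Mbpp/100".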
echo "๐Ÿ“‹ Configuration:"
echo " Model: $MODEL"
echo " Benchmark: $BENCHMARK"
echo " Max Problems: $MAX_PROBLEMS (0 = ์ „์ฒด)"
echo " GPU: $GPU_ID"
if [[ -n "$RESUME_OPTIONS" ]]; then
echo " Resume: Enabled"
fi
if [[ -n "$START_FROM" ]]; then
echo " Start From: $(echo $START_FROM | cut -d' ' -f2)"
fi
echo ""
# Record the start time
START_TIME=$(date +%s)
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
# Log directory (absolute path)
BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
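# BASE_DIR is the absolute path of the directory containing this script, so the run
# behaves the same regardless of the caller's current working directory.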
LOG_DIR="${BASE_DIR}/../tmp/logs"
mkdir -p "$LOG_DIR"
# Log file
LOG_FILE="${LOG_DIR}/batch_evaluation_${TIMESTAMP}.log"
echo "๐Ÿ“ Full log will be saved to: $LOG_FILE"
echo ""
# Write the run header to the log file
echo "==========================================================" > "$LOG_FILE"
echo "TestTime RLVR Batch Evaluation Log" >> "$LOG_FILE"
echo "Started at: $(date)" >> "$LOG_FILE"
echo "Model: $MODEL" >> "$LOG_FILE"
echo "Benchmark: $BENCHMARK" >> "$LOG_FILE"
echo "Max Problems: $MAX_PROBLEMS" >> "$LOG_FILE"
echo "GPU: $GPU_ID" >> "$LOG_FILE"
echo "Log File: $LOG_FILE" >> "$LOG_FILE"
echo "==========================================================" >> "$LOG_FILE"
echo "" >> "$LOG_FILE"
# Logging helper: print to the console and append to the log file
log_and_display() {
echo "$1" | tee -a "$LOG_FILE"
}
# Run a command, mirroring its stdout/stderr to the log file
run_with_log() {
local cmd="$1"
echo "Executing: $cmd" >> "$LOG_FILE"
echo "----------------------------------------" >> "$LOG_FILE"
# Preserve the command's exit code via a temp file (the pipeline's own status would be tee's)
local temp_exit_file=$(mktemp)
(eval "$cmd"; echo $? > "$temp_exit_file") 2>&1 | tee -a "$LOG_FILE"
local exit_code=$(cat "$temp_exit_file")
rm -f "$temp_exit_file"
echo "----------------------------------------" >> "$LOG_FILE"
echo "Exit code: $exit_code" >> "$LOG_FILE"
echo "" >> "$LOG_FILE"
return $exit_code
}
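# Design note: the temp file above preserves the wrapped command's exit code even though
# tee is the last stage of the pipeline. A bash-only alternative (sketch, not used here)
# would read it from PIPESTATUS instead:
#   eval "$cmd" 2>&1 | tee -a "$LOG_FILE"
#   local exit_code=${PIPESTATUS[0]}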
# Run all benchmarks (the 'all' option)
if [[ "$BENCHMARK" == "all" ]]; then
log_and_display "๐ŸŽฏ Running evaluation on ALL benchmarks (MBPP+ and HumanEval+)"
log_and_display ""
# MBPP+ run
log_and_display "==============================================="
log_and_display "๐Ÿ“Š MBPP+ Evaluation"
log_and_display "==============================================="
run_with_log "cd '$BASE_DIR' && python batch_evaluate_testtime.py --model '$MODEL' --benchmark 'mbpp' --max_problems $MAX_PROBLEMS --gpu $GPU_ID --output_dir '${BASE_DIR}/../tmp/batch_results_${TIMESTAMP}' $RESUME_OPTIONS $START_FROM"
MBPP_EXIT_CODE=$?
if [ $MBPP_EXIT_CODE -eq 0 ]; then
log_and_display "โœ… MBPP+ evaluation completed"
# MBPP+ ๊ฒฐ๊ณผ ์š”์•ฝ ํ‘œ์‹œ
log_and_display ""
log_and_display "๐Ÿ“Š MBPP+ Results Summary:"
log_and_display "==============================================="
# Find the most recent MBPP result directory
MBPP_RESULT_DIR=$(find "${BASE_DIR}/../tmp/batch_results_${TIMESTAMP}" -name "*mbpp*" -type d | head -1)
if [ -n "$MBPP_RESULT_DIR" ] && [ -f "$MBPP_RESULT_DIR/evaluation_summary.md" ]; then
# Extract and display the summary
log_and_display "📁 Results saved to: $MBPP_RESULT_DIR"
# Key statistics
MBPP_ACCURACY=$(grep "Initial Solution Accuracy" "$MBPP_RESULT_DIR/evaluation_summary.md" | head -1 | sed 's/.*: //' || echo "N/A")
MBPP_TOTAL=$(grep "Total Problems" "$MBPP_RESULT_DIR/evaluation_summary.md" | sed 's/.*: //' || echo "N/A")
MBPP_TIME=$(grep "Total Execution Time" "$MBPP_RESULT_DIR/evaluation_summary.md" | sed 's/.*: //' || echo "N/A")
log_and_display "๐Ÿ“ˆ Initial Solution Accuracy: $MBPP_ACCURACY"
log_and_display "๐Ÿ“Š Total Problems: $MBPP_TOTAL"
log_and_display "โฑ๏ธ Execution Time: $MBPP_TIME"
# Reasoning task ์ •ํ™•๋ฅ  ํ‘œ์‹œ
if grep -q "Problem-based Accuracy" "$MBPP_RESULT_DIR/evaluation_summary.md"; then
log_and_display "๐Ÿง  Reasoning Task Performance:"
grep "Problem-based Accuracy" "$MBPP_RESULT_DIR/evaluation_summary.md" | while read line; do
task_name=$(echo "$line" | sed 's/.*\*\*\([^*]*\)\*\*.*/\1/')
accuracy=$(echo "$line" | sed 's/.*: \([0-9.]*\).*/\1/')
log_and_display " - $task_name: $accuracy"
done
fi
else
log_and_display "๐Ÿ“ Results directory: $MBPP_RESULT_DIR (summary not yet available)"
fi
else
log_and_display "โŒ MBPP+ evaluation failed"
fi
log_and_display ""
log_and_display "๐Ÿ”„ Proceeding to HumanEval+ evaluation..."
log_and_display ""
# HumanEval+ run
log_and_display "==============================================="
log_and_display "๐Ÿ“Š HumanEval+ Evaluation"
log_and_display "==============================================="
run_with_log "cd '$BASE_DIR' && python batch_evaluate_testtime.py --model '$MODEL' --benchmark 'humaneval' --max_problems $MAX_PROBLEMS --gpu $GPU_ID --output_dir '${BASE_DIR}/../tmp/batch_results_${TIMESTAMP}' $RESUME_OPTIONS $START_FROM"
HUMANEVAL_EXIT_CODE=$?
if [ $HUMANEVAL_EXIT_CODE -eq 0 ]; then
log_and_display "โœ… HumanEval+ evaluation completed"
# HumanEval+ ๊ฒฐ๊ณผ ์š”์•ฝ ํ‘œ์‹œ
log_and_display ""
log_and_display "๐Ÿ“Š HumanEval+ Results Summary:"
log_and_display "==============================================="
# Find the most recent HumanEval result directory
HUMANEVAL_RESULT_DIR=$(find "${BASE_DIR}/../tmp/batch_results_${TIMESTAMP}" -name "*humaneval*" -type d | head -1)
if [ -n "$HUMANEVAL_RESULT_DIR" ] && [ -f "$HUMANEVAL_RESULT_DIR/evaluation_summary.md" ]; then
# Extract and display the summary
log_and_display "📁 Results saved to: $HUMANEVAL_RESULT_DIR"
# Key statistics
HUMANEVAL_ACCURACY=$(grep "Initial Solution Accuracy" "$HUMANEVAL_RESULT_DIR/evaluation_summary.md" | head -1 | sed 's/.*: //' || echo "N/A")
HUMANEVAL_TOTAL=$(grep "Total Problems" "$HUMANEVAL_RESULT_DIR/evaluation_summary.md" | sed 's/.*: //' || echo "N/A")
HUMANEVAL_TIME=$(grep "Total Execution Time" "$HUMANEVAL_RESULT_DIR/evaluation_summary.md" | sed 's/.*: //' || echo "N/A")
log_and_display "๐Ÿ“ˆ Initial Solution Accuracy: $HUMANEVAL_ACCURACY"
log_and_display "๐Ÿ“Š Total Problems: $HUMANEVAL_TOTAL"
log_and_display "โฑ๏ธ Execution Time: $HUMANEVAL_TIME"
# Reasoning task ์ •ํ™•๋ฅ  ํ‘œ์‹œ
if grep -q "Problem-based Accuracy" "$HUMANEVAL_RESULT_DIR/evaluation_summary.md"; then
log_and_display "๐Ÿง  Reasoning Task Performance:"
grep "Problem-based Accuracy" "$HUMANEVAL_RESULT_DIR/evaluation_summary.md" | while read line; do
task_name=$(echo "$line" | sed 's/.*\*\*\([^*]*\)\*\*.*/\1/')
accuracy=$(echo "$line" | sed 's/.*: \([0-9.]*\).*/\1/')
log_and_display " - $task_name: $accuracy"
done
fi
else
log_and_display "๐Ÿ“ Results directory: $HUMANEVAL_RESULT_DIR (summary not yet available)"
fi
else
log_and_display "โŒ HumanEval+ evaluation failed"
fi
# Final combined summary
log_and_display ""
log_and_display "๐ŸŽ‰ All Benchmarks Completed!"
log_and_display "==============================================="
log_and_display "๐Ÿ“Š Final Summary:"
log_and_display " MBPP+: $([ $MBPP_EXIT_CODE -eq 0 ] && echo "โœ… Success" || echo "โŒ Failed")"
log_and_display " HumanEval+: $([ $HUMANEVAL_EXIT_CODE -eq 0 ] && echo "โœ… Success" || echo "โŒ Failed")"
# Combined results directory
OUTPUT_DIR="${BASE_DIR}/../tmp/batch_results_${TIMESTAMP}"
log_and_display ""
log_and_display "๐Ÿ“ All results saved in: $OUTPUT_DIR"
# Per-benchmark result directories
if [ $MBPP_EXIT_CODE -eq 0 ]; then
MBPP_DIR=$(find "$OUTPUT_DIR" -name "*mbpp*" -type d | head -1)
[ -n "$MBPP_DIR" ] && log_and_display " ๐Ÿ“‚ MBPP+ detailed results: $MBPP_DIR"
fi
if [ $HUMANEVAL_EXIT_CODE -eq 0 ]; then
HUMANEVAL_DIR=$(find "$OUTPUT_DIR" -name "*humaneval*" -type d | head -1)
[ -n "$HUMANEVAL_DIR" ] && log_and_display " ๐Ÿ“‚ HumanEval+ detailed results: $HUMANEVAL_DIR"
fi
else
# Single-benchmark run
log_and_display "๐ŸŽฏ Running evaluation on $BENCHMARK benchmark"
log_and_display ""
run_with_log "cd '$BASE_DIR' && python batch_evaluate_testtime.py --model '$MODEL' --benchmark '$BENCHMARK' --max_problems $MAX_PROBLEMS --gpu $GPU_ID --output_dir '${BASE_DIR}/../tmp/batch_results' $RESUME_OPTIONS $START_FROM"
# Capture the Python exit code so it can be propagated at the end
PYTHON_EXIT_CODE=$?
OUTPUT_DIR="${BASE_DIR}/../tmp/batch_results"
fi
# Compute elapsed time
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
HOURS=$((DURATION / 3600))
MINUTES=$(((DURATION % 3600) / 60))
SECS=$((DURATION % 60))  # use SECS to avoid overwriting bash's special SECONDS variable
log_and_display ""
log_and_display "๐ŸŽ‰ Batch evaluation completed!"
log_and_display "โฑ๏ธ Total duration: ${HOURS}h ${MINUTES}m ${SECONDS}s"
log_and_display "๐Ÿ“ Check results in: $OUTPUT_DIR"
log_and_display "๐Ÿ“ Full log saved to: $LOG_FILE"
echo ""
echo "๐Ÿ“‹ Summary:"
echo " Results: $OUTPUT_DIR"
echo " Full Log: $LOG_FILE"
echo " Duration: ${HOURS}h ${MINUTES}m ${SECONDS}s"
# Exit with a status that reflects the evaluation result
if [[ "$BENCHMARK" == "all" ]]; then
    { [ "${MBPP_EXIT_CODE:-1}" -eq 0 ] && [ "${HUMANEVAL_EXIT_CODE:-1}" -eq 0 ]; } && exit 0
    exit 1
else
    exit "${PYTHON_EXIT_CODE:-0}"
fi