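#!/bin/bash
#
# Batch evaluation launcher for TestTime RLVR.
#
# Runs batch_evaluate_testtime.py on MBPP+, HumanEval+, or both, and copies
# the run's output into a timestamped log file under ../tmp/logs (relative to
# this script's directory). Run with --help or -h for the list of arguments.
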
print_usage() { |
|
echo "์ฌ์ฉ๋ฒ: $0 [MODEL] [BENCHMARK] [MAX_PROBLEMS] [GPU_ID] [RESUME_OPTIONS]" |
|
echo "" |
|
echo "๋งค๊ฐ๋ณ์:" |
|
echo " MODEL ๋ชจ๋ธ ์ด๋ฆ (๊ธฐ๋ณธ๊ฐ: Qwen/Qwen2.5-7B)" |
|
echo " BENCHMARK ๋ฒค์น๋งํฌ (mbpp|humaneval|all, ๊ธฐ๋ณธ๊ฐ: mbpp)" |
|
echo " MAX_PROBLEMS ์ต๋ ๋ฌธ์ ์ (0=์ ์ฒด, ๊ธฐ๋ณธ๊ฐ: 10)" |
|
echo " GPU_ID GPU ๋ฒํธ (๊ธฐ๋ณธ๊ฐ: 6)" |
|
echo "" |
|
echo "Resume ์ต์
:" |
|
echo " --resume ์ด์ ์ ์๋ฃ๋ ๋ฌธ์ ๋ค์ ์ ์ธํ๊ณ ์ด์ด์ ์คํ" |
|
echo " --start-from PROBLEM ํน์ ๋ฌธ์ ID๋ถํฐ ์์ (์: Mbpp/100)" |
|
echo "" |
|
echo "์์:" |
|
echo " $0 # ๊ธฐ๋ณธ ์ค์ (MBPP 10๋ฌธ์ )" |
|
echo " $0 \"Qwen/Qwen2.5-7B\" mbpp 0 6 # MBPP ์ ์ฒด" |
|
echo " $0 \"Qwen/Qwen2.5-7B\" all 0 6 --resume # ๋ชจ๋ ๋ฒค์น๋งํฌ ์ ์ฒด (์ด์ด์)" |
|
echo " $0 \"Qwen/Qwen2.5-7B\" mbpp 0 6 --start-from Mbpp/100 # Mbpp/100๋ถํฐ ์์" |
|
} |
# Show the help text and stop before touching the environment when the first
# argument asks for it.
case "${1:-}" in
  -h | --help)
    print_usage
    exit 0
    ;;
esac

# Make the project's evaluation code importable.
# ${PYTHONPATH:+:...} appends the previous value only when there is one, so an
# unset or empty PYTHONPATH does not leave a trailing ':' (an empty entry).
export PYTHONPATH="/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/coding${PYTHONPATH:+:${PYTHONPATH}}"

# Point the Hugging Face cache locations at /data/.cache/huggingface.
export HF_HOME="/data/.cache/huggingface"
export TRANSFORMERS_CACHE="/data/.cache/huggingface"

# Put the azr environment's site-packages first on the Python search path.
export PYTHONPATH="/data/miniforge3/envs/azr/lib/python3.10/site-packages:${PYTHONPATH}"

# Read by the Hugging Face tokenizers library; presumably set to avoid its
# warnings about parallelism after a fork — TODO confirm.
export TOKENIZERS_PARALLELISM=false

echo "🚀 TestTime RLVR Batch Evaluation"
echo "=================================="
#######################################
# Parse the command line into the globals used by the rest of the script.
# Up to four leading arguments that do not start with "--" are taken as
# MODEL, BENCHMARK, MAX_PROBLEMS and GPU_ID; everything after that must be a
# resume option.
# Globals:   MODEL, BENCHMARK, MAX_PROBLEMS, GPU_ID, RESUME_OPTIONS,
#            START_FROM (all written)
# Arguments: the script's own arguments ("$@")
# Outputs:   on a bad option, an error line followed by the usage text
# Returns:   0 on success; exits the script with status 1 on a bad option
#######################################
parse_args() {
  local -a positional=()

  # Collect only the positional arguments that were actually given. A fixed
  # `shift 4` fails and shifts nothing when fewer than four are present,
  # which made the option loop below reject the model name as an option.
  while (( $# > 0 && ${#positional[@]} < 4 )) && [[ "$1" != --* ]]; do
    positional+=("$1")
    shift
  done

  MODEL=${positional[0]:-"Qwen/Qwen2.5-7B"}
  BENCHMARK=${positional[1]:-"mbpp"}
  MAX_PROBLEMS=${positional[2]:-10}
  GPU_ID=${positional[3]:-6}

  RESUME_OPTIONS=""
  START_FROM=""

  while (( $# > 0 )); do
    case "$1" in
      --resume)
        RESUME_OPTIONS="--resume"
        shift
        ;;
      --start-from)
        if [[ -z "${2:-}" ]]; then
          echo "❌ Error: --start-from requires a problem ID"
          print_usage
          exit 1
        fi
        # Stored as a ready-made "--start_from <ID>" fragment, the form the
        # evaluation command line expects.
        START_FROM="--start_from $2"
        shift 2
        ;;
      *)
        echo "❌ Error: Unknown option $1"
        print_usage
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
#######################################
# Print the effective settings for this run to stdout.
# Globals:   MODEL, BENCHMARK, MAX_PROBLEMS, GPU_ID, RESUME_OPTIONS,
#            START_FROM (all read)
# Arguments: none
# Outputs:   the configuration listing followed by a blank line
#######################################
print_configuration() {
  echo "📋 Configuration:"
  echo " Model: $MODEL"
  echo " Benchmark: $BENCHMARK"
  echo " Max Problems: $MAX_PROBLEMS (0 = 전체)"
  echo " GPU: $GPU_ID"
  if [[ -n "$RESUME_OPTIONS" ]]; then
    echo " Resume: Enabled"
  fi
  if [[ -n "$START_FROM" ]]; then
    # START_FROM holds "--start_from <ID>"; strip the flag so only the ID is
    # shown. Done with parameter expansion so the value is never word-split
    # or glob-expanded on its way through echo and cut.
    echo " Start From: ${START_FROM#--start_from }"
  fi
  echo ""
}

print_configuration
# Start of the run in epoch seconds (used later for the total duration) and a
# timestamp that makes this run's file names unique.
START_TIME=$(date +%s)
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")

# Logs go to ../tmp/logs relative to the directory that contains this script,
# independent of the caller's working directory.
BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LOG_DIR="${BASE_DIR}/../tmp/logs"
# Keep going without a log directory (the evaluation can still run), but say
# so instead of failing silently.
mkdir -p "$LOG_DIR" \
  || echo "Warning: could not create log directory: $LOG_DIR" >&2

LOG_FILE="${LOG_DIR}/batch_evaluation_${TIMESTAMP}.log"
echo "📝 Full log will be saved to: $LOG_FILE"
echo ""

# Write the log header in one go; '>' creates (or truncates) the file.
{
  echo "=========================================================="
  echo "TestTime RLVR Batch Evaluation Log"
  echo "Started at: $(date)"
  echo "Model: $MODEL"
  echo "Benchmark: $BENCHMARK"
  echo "Max Problems: $MAX_PROBLEMS"
  echo "GPU: $GPU_ID"
  echo "Log File: $LOG_FILE"
  echo "=========================================================="
  echo ""
} > "$LOG_FILE"
#######################################
# Print one message to stdout and append the same line to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text; may be empty to produce a blank line
# Outputs:   the message on stdout and at the end of $LOG_FILE
#######################################
log_and_display() {
  # printf rather than echo: a message such as "-n" or "-e" is printed as
  # text instead of being consumed as an echo option.
  printf '%s\n' "$1" | tee -a "$LOG_FILE"
}
#######################################
# Run a command line, showing its output and recording it in the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - command line as a single string; it is passed to eval in a
#                 subshell, so it must already be quoted for the shell and
#                 cannot change this script's directory or variables
# Outputs:   the command's stdout and stderr (merged) on stdout and in the
#            log, framed in the log by the command text and its exit code
# Returns:   the exit status of the command
#######################################
run_with_log() {
  local cmd="$1"
  local exit_code

  {
    echo "Executing: $cmd"
    echo "----------------------------------------"
  } >> "$LOG_FILE"

  # PIPESTATUS[0] is the status of the subshell, not of tee. It is also
  # correct when the command itself calls `exit`, and needs no temp file.
  ( eval "$cmd" ) 2>&1 | tee -a "$LOG_FILE"
  exit_code=${PIPESTATUS[0]}

  {
    echo "----------------------------------------"
    echo "Exit code: $exit_code"
    echo ""
  } >> "$LOG_FILE"

  return "$exit_code"
}
#######################################
# Run batch_evaluate_testtime.py for one benchmark through run_with_log.
# Globals:   BASE_DIR, MODEL, MAX_PROBLEMS, GPU_ID, RESUME_OPTIONS,
#            START_FROM (all read)
# Arguments: $1 - benchmark name passed to --benchmark
#            $2 - directory passed to --output_dir
# Returns:   the status returned by run_with_log
#######################################
evaluate_benchmark() {
  local benchmark=$1
  local output_dir=$2

  # RESUME_OPTIONS and START_FROM are appended unquoted on purpose: each is
  # either empty or a ready-made option fragment.
  run_with_log "cd '$BASE_DIR' && python batch_evaluate_testtime.py --model '$MODEL' --benchmark '$benchmark' --max_problems $MAX_PROBLEMS --gpu $GPU_ID --output_dir '$output_dir' $RESUME_OPTIONS $START_FROM"
}

#######################################
# Print the text after the last ": " on every line of a file that contains a
# label, or "N/A" when no line matches.
# Arguments: $1 - summary file, $2 - label to search for
# Outputs:   the extracted value(s), one per line, or N/A
#######################################
summary_field() {
  local summary_file=$1
  local label=$2
  local value

  value=$(grep -- "$label" "$summary_file" | sed 's/.*: //')
  # Fall back on the captured text: the status of a `grep | sed` pipeline is
  # sed's, so an `|| echo "N/A"` after it would never run.
  printf '%s\n' "${value:-N/A}"
}

#######################################
# Translate an exit code into the status text of the final summary.
# Arguments: $1 - exit code
#######################################
status_text() {
  if (( $1 == 0 )); then
    echo "✅ Success"
  else
    echo "❌ Failed"
  fi
}

#######################################
# Locate one benchmark's result directory and print the figures found in its
# evaluation_summary.md.
# Arguments: $1 - display label (e.g. "MBPP+")
#            $2 - substring of the result directory name (e.g. "mbpp")
#            $3 - directory to search
#######################################
report_benchmark_results() {
  local label=$1
  local dir_pattern=$2
  local results_root=$3
  local result_dir summary_file accuracy total elapsed
  local line task_name task_accuracy

  log_and_display ""
  log_and_display "📊 ${label} Results Summary:"
  log_and_display "==============================================="

  # First directory below the results root whose name contains the pattern.
  result_dir=$(find "$results_root" -name "*${dir_pattern}*" -type d | head -1)
  summary_file="${result_dir}/evaluation_summary.md"

  if [[ -z "$result_dir" || ! -f "$summary_file" ]]; then
    log_and_display "📁 Results directory: $result_dir (summary not yet available)"
    return
  fi

  log_and_display "📁 Results saved to: $result_dir"

  accuracy=$(summary_field "$summary_file" "Initial Solution Accuracy" | head -n 1)
  total=$(summary_field "$summary_file" "Total Problems")
  elapsed=$(summary_field "$summary_file" "Total Execution Time")

  log_and_display "📈 Initial Solution Accuracy: $accuracy"
  log_and_display "📝 Total Problems: $total"
  log_and_display "⏱️ Execution Time: $elapsed"

  if grep -q "Problem-based Accuracy" "$summary_file"; then
    log_and_display "🧠 Reasoning Task Performance:"
    # -r keeps backslashes in the summary line intact.
    while read -r line; do
      # Task name: the text between the last pair of ** markers on the line.
      task_name=$(sed 's/.*\*\*\([^*]*\)\*\*.*/\1/' <<< "$line")
      # Accuracy: the run of digits and dots after the last ": ".
      task_accuracy=$(sed 's/.*: \([0-9.]*\).*/\1/' <<< "$line")
      log_and_display " - $task_name: $task_accuracy"
    done < <(grep "Problem-based Accuracy" "$summary_file")
  fi
}

#######################################
# Run one benchmark under a banner and, on success, print its results.
# Arguments: $1 - display label, $2 - benchmark name, $3 - output directory
# Returns:   the evaluation's exit status
#######################################
run_and_report() {
  local label=$1
  local benchmark=$2
  local output_dir=$3
  local status

  log_and_display "==============================================="
  log_and_display "📊 ${label} Evaluation"
  log_and_display "==============================================="

  evaluate_benchmark "$benchmark" "$output_dir"
  status=$?

  if (( status == 0 )); then
    log_and_display "✅ ${label} evaluation completed"
    report_benchmark_results "$label" "$benchmark" "$output_dir"
  else
    log_and_display "❌ ${label} evaluation failed"
  fi

  return "$status"
}

if [[ "$BENCHMARK" == "all" ]]; then
  # Both benchmarks write below one directory that is unique to this run.
  OUTPUT_DIR="${BASE_DIR}/../tmp/batch_results_${TIMESTAMP}"

  log_and_display "🎯 Running evaluation on ALL benchmarks (MBPP+ and HumanEval+)"
  log_and_display ""

  run_and_report "MBPP+" "mbpp" "$OUTPUT_DIR"
  MBPP_EXIT_CODE=$?

  # HumanEval+ runs even when MBPP+ failed.
  log_and_display ""
  log_and_display "🔄 Proceeding to HumanEval+ evaluation..."
  log_and_display ""

  run_and_report "HumanEval+" "humaneval" "$OUTPUT_DIR"
  HUMANEVAL_EXIT_CODE=$?

  log_and_display ""
  log_and_display "🎉 All Benchmarks Completed!"
  log_and_display "==============================================="
  log_and_display "📊 Final Summary:"
  log_and_display " MBPP+: $(status_text "$MBPP_EXIT_CODE")"
  log_and_display " HumanEval+: $(status_text "$HUMANEVAL_EXIT_CODE")"

  log_and_display ""
  log_and_display "📁 All results saved in: $OUTPUT_DIR"

  if (( MBPP_EXIT_CODE == 0 )); then
    MBPP_DIR=$(find "$OUTPUT_DIR" -name "*mbpp*" -type d | head -1)
    if [[ -n "$MBPP_DIR" ]]; then
      log_and_display " 📂 MBPP+ detailed results: $MBPP_DIR"
    fi
  fi

  if (( HUMANEVAL_EXIT_CODE == 0 )); then
    HUMANEVAL_DIR=$(find "$OUTPUT_DIR" -name "*humaneval*" -type d | head -1)
    if [[ -n "$HUMANEVAL_DIR" ]]; then
      log_and_display " 📂 HumanEval+ detailed results: $HUMANEVAL_DIR"
    fi
  fi
else
  # A single benchmark writes to the shared, non-timestamped directory.
  OUTPUT_DIR="${BASE_DIR}/../tmp/batch_results"

  log_and_display "🎯 Running evaluation on $BENCHMARK benchmark"
  log_and_display ""

  evaluate_benchmark "$BENCHMARK" "$OUTPUT_DIR"
  PYTHON_EXIT_CODE=$?
fi
# Total wall-clock time of the run, split into hours, minutes and seconds.
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
HOURS=$((DURATION / 3600))
MINUTES=$(((DURATION % 3600) / 60))
# Deliberately not named SECONDS: that is a special bash variable which keeps
# counting up after it is assigned, so the value printed below could differ
# from the one computed here.
DURATION_SECS=$((DURATION % 60))

log_and_display ""
log_and_display "🏁 Batch evaluation completed!"
log_and_display "⏱️ Total duration: ${HOURS}h ${MINUTES}m ${DURATION_SECS}s"
log_and_display "📁 Check results in: $OUTPUT_DIR"
log_and_display "📝 Full log saved to: $LOG_FILE"

# Closing recap on the terminal only; it is not written to the log file.
echo ""
echo "📋 Summary:"
echo " Results: $OUTPUT_DIR"
echo " Full Log: $LOG_FILE"
echo " Duration: ${HOURS}h ${MINUTES}m ${DURATION_SECS}s"

# NOTE(review): the script reports success even when an evaluation failed.
# PYTHON_EXIT_CODE (single benchmark) and MBPP_EXIT_CODE / HUMANEVAL_EXIT_CODE
# ("all" mode) hold the real statuses if callers should see failures —
# confirm whether exit 0 is intentional before changing it.
exit 0