#!/usr/bin/env python3
"""
Complete TestTime RLVR Pipeline Test Script
Tests the AZR-based TestTime RLVR pipeline on real benchmark problems.
Verifies the full flow: LLM solution generation → IPO extraction → task generation → LLM evaluation → reward computation.
"""
import os
import sys
import torch
import argparse
import json
from pathlib import Path
from datetime import datetime
# TestTime RLVR module imports
sys.path.append('/home/ubuntu/RLVR/TestTime-RLVR-v2')
from absolute_zero_reasoner.testtime.complete_pipeline import CompleteTestTimePipeline
from absolute_zero_reasoner.testtime.config import TestTimeConfig, BenchmarkConfig
from absolute_zero_reasoner.testtime.logger import TestTimeLogger
from absolute_zero_reasoner.testtime.solution_generator import InitialSolutionGenerator
def load_test_problem():
"""๊ฐ„๋‹จํ•œ ํ…Œ์ŠคํŠธ ๋ฌธ์ œ ์ƒ์„ฑ (HumanEval ์Šคํƒ€์ผ)"""
return {
'task_id': 'test/simple_sum',
'prompt': '''def add_two_numbers(a, b):
"""
Add two numbers and return the result.
Args:
a (int): First number
b (int): Second number
Returns:
int: Sum of a and b
Examples:
>>> add_two_numbers(2, 3)
5
>>> add_two_numbers(-1, 1)
0
>>> add_two_numbers(0, 0)
0
"""''',
'entry_point': 'add_two_numbers',
'canonical_solution': 'def add_two_numbers(a, b):\n return a + b',
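        # HumanEval-style checker: `check` receives the candidate function and asserts on the sample inputs/outputs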
'test': '''def check(candidate):
assert candidate(2, 3) == 5
assert candidate(-1, 1) == 0
assert candidate(0, 0) == 0
assert candidate(10, -5) == 5'''
}
def save_detailed_results(result, args, output_dir):
"""์ƒ์„ธํ•œ ๊ฒฐ๊ณผ๋ฅผ ๊ฐœ๋ณ„ ํŒŒ์ผ๋กœ ์ €์žฅ"""
# ๋ฒค์น˜๋งˆํฌ์™€ ๋ฌธ์ œ ID์— ๋”ฐ๋ฅธ ๋””๋ ‰ํ† ๋ฆฌ ๊ตฌ์กฐ ์ƒ์„ฑ
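    # Output layout produced below:
    #   {output_dir}/{benchmark}/{task_id}/initial_solution/   - original problem, LLM solution, extracted program
    #   {output_dir}/{benchmark}/{task_id}/ipo_triples/        - one JSON file per IPO triple
    #   {output_dir}/{benchmark}/{task_id}/task_prompts/       - generated task prompts
    #   {output_dir}/{benchmark}/{task_id}/llm_responses/      - LLM responses per task
    #   {output_dir}/{benchmark}/{task_id}/extracted_answers/  - extracted answers with match results
    #   plus reward analysis/summary files at the top level of that directory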
benchmark = result.get('benchmark', 'unknown')
    problem_id = result['problem_id']  # keep the '/'
    problem_id_safe = problem_id.replace('/', '_')  # for use in file names
    # Create directories using the {output_dir}/{benchmark}/{task_id} layout
base_dir = os.path.join(output_dir, benchmark, problem_id_safe)
os.makedirs(base_dir, exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    # 1. Save the initial LLM solution (the benchmark problem attempt)
if 'llm_generation' in result['steps']:
llm_step = result['steps']['llm_generation']
initial_solution_dir = os.path.join(base_dir, 'initial_solution')
os.makedirs(initial_solution_dir, exist_ok=True)
        # Save the original benchmark problem
if 'problem_loading' in result['steps']:
problem_data = result['steps']['problem_loading'].get('problem', {})
problem_file = os.path.join(initial_solution_dir, f"{problem_id_safe}_original_problem.txt")
with open(problem_file, 'w', encoding='utf-8') as f:
f.write(f"Problem ID: {result['problem_id']}\n")
f.write(f"Benchmark: {result['benchmark']}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n")
f.write("ORIGINAL BENCHMARK PROBLEM:\n")
f.write("="*80 + "\n")
f.write(problem_data.get('prompt', 'No prompt available'))
f.write("\n" + "="*80 + "\n")
f.write("ENTRY POINT:\n")
f.write("="*80 + "\n")
f.write(problem_data.get('entry_point', 'No entry point'))
f.write("\n" + "="*80 + "\n")
f.write("CANONICAL SOLUTION:\n")
f.write("="*80 + "\n")
f.write(problem_data.get('canonical_solution', 'No canonical solution'))
if 'test' in problem_data:
f.write("\n" + "="*80 + "\n")
f.write("TEST CASES:\n")
f.write("="*80 + "\n")
f.write(str(problem_data['test']))
        # Save the LLM-generated solution
llm_solution_file = os.path.join(initial_solution_dir, f"{problem_id_safe}_llm_solution.txt")
with open(llm_solution_file, 'w', encoding='utf-8') as f:
f.write(f"Problem ID: {result['problem_id']}\n")
f.write(f"Benchmark: {result['benchmark']}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n")
f.write("LLM GENERATED SOLUTION:\n")
f.write("="*80 + "\n")
f.write(llm_step.get('solution', 'No solution generated'))
f.write("\n" + "="*80 + "\n")
f.write("SYNTAX VALIDATION:\n")
f.write("="*80 + "\n")
syntax_valid = llm_step.get('syntax_valid', False)
f.write(f"Valid: {'โœ… YES' if syntax_valid else 'โŒ NO'}")
if llm_step.get('syntax_error'):
f.write(f"\nError: {llm_step['syntax_error']}")
            # Append the initial solution correctness evaluation results
f.write("\n" + "="*80 + "\n")
f.write("SOLUTION CORRECTNESS EVALUATION:\n")
f.write("="*80 + "\n")
solution_eval = llm_step.get('solution_evaluation')
if solution_eval:
if solution_eval['correct']:
f.write(f"Result: โœ… CORRECT ({solution_eval['passed_tests']}/{solution_eval['total_tests']} tests passed)\n")
else:
f.write(f"Result: โŒ INCORRECT ({solution_eval['passed_tests']}/{solution_eval['total_tests']} tests passed)\n")
if solution_eval.get('error'):
f.write(f"Error: {solution_eval['error']}\n")
                # Execution result details
if solution_eval.get('execution_results'):
f.write("\nExecution Details:\n")
for i, exec_result in enumerate(solution_eval['execution_results']):
f.write(f" Test {i+1}:\n")
f.write(f" Status: {exec_result.get('status', 'N/A')}\n")
if 'result' in exec_result:
f.write(f" Result: {exec_result['result'][:100]}...\n")
else:
f.write("No evaluation performed (syntax error or no test cases)\n")
        # Save the program that was used for IPO extraction
if 'ipo_extraction' in result['steps']:
ipo_step = result['steps']['ipo_extraction']
if 'extracted_program' in ipo_step:
extracted_program_file = os.path.join(initial_solution_dir, f"{problem_id_safe}_extracted_program.py")
with open(extracted_program_file, 'w', encoding='utf-8') as f:
f.write(f"# Problem ID: {result['problem_id']}\n")
f.write(f"# Benchmark: {result['benchmark']}\n")
f.write(f"# Generated: {timestamp}\n")
f.write(f"# Extracted from LLM solution for IPO generation\n\n")
f.write(ipo_step['extracted_program'])
print(f"๐Ÿ“ ์ดˆ๊ธฐ ์†”๋ฃจ์…˜ ์ €์žฅ: {initial_solution_dir}/")
# 2. IPO ํŠธ๋ฆฌํ”Œ ์ €์žฅ
if 'ipo_extraction' in result['steps']:
ipo_step = result['steps']['ipo_extraction']
triples = ipo_step.get('triples', [])
ipo_dir = os.path.join(base_dir, 'ipo_triples')
os.makedirs(ipo_dir, exist_ok=True)
for i, triple in enumerate(triples):
triple_file = os.path.join(ipo_dir, f"{problem_id_safe}_triple_{i+1}.json")
with open(triple_file, 'w', encoding='utf-8') as f:
json.dump(triple, f, indent=2, ensure_ascii=False)
print(f"๐Ÿ“ IPO ํŠธ๋ฆฌํ”Œ ์ €์žฅ: {ipo_dir}/ ({len(triples)}๊ฐœ ํŒŒ์ผ)")
# 3. ์ƒ์„ฑ๋œ ํƒœ์Šคํฌ ํ”„๋กฌํ”„ํŠธ ์ €์žฅ
if 'task_generation' in result['steps']:
task_step = result['steps']['task_generation']
all_tasks = task_step.get('all_tasks', {})
task_dir = os.path.join(base_dir, 'task_prompts')
os.makedirs(task_dir, exist_ok=True)
task_count = 0
for task_type, tasks in all_tasks.items():
for i, task in enumerate(tasks):
task_file = os.path.join(task_dir, f"{problem_id_safe}_{task_type}_{i+1}.txt")
with open(task_file, 'w', encoding='utf-8') as f:
f.write(f"Task Type: {task_type}\n")
f.write(f"Task ID: {task.get('task_id', 'N/A')}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n")
f.write("TASK PROMPT:\n")
f.write("="*80 + "\n")
f.write(task.get('prompt', 'No prompt available'))
f.write("\n" + "="*80 + "\n")
f.write("EXPECTED SOLUTION:\n")
f.write("="*80 + "\n")
f.write(task.get('expected_solution', 'No expected solution'))
f.write("\n" + "="*80 + "\n")
f.write("EVALUATION DATA:\n")
f.write("="*80 + "\n")
f.write(str(task.get('evaluation_data', 'No evaluation data')))
task_count += 1
print(f"๐Ÿ“ ํƒœ์Šคํฌ ํ”„๋กฌํ”„ํŠธ ์ €์žฅ: {task_dir}/ ({task_count}๊ฐœ ํŒŒ์ผ)")
# 4. LLM ํƒœ์Šคํฌ ์‘๋‹ต ์ €์žฅ
if 'task_evaluation' in result['steps']:
eval_step = result['steps']['task_evaluation']
evaluations = eval_step.get('evaluations', {})
response_dir = os.path.join(base_dir, 'llm_responses')
os.makedirs(response_dir, exist_ok=True)
response_count = 0
for task_type, task_evals in evaluations.items():
for i, evaluation in enumerate(task_evals):
response_file = os.path.join(response_dir, f"{problem_id_safe}_{task_type}_{i+1}_response.txt")
with open(response_file, 'w', encoding='utf-8') as f:
f.write(f"Task Type: {task_type}\n")
f.write(f"Task ID: {evaluation.get('task_id', 'N/A')}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n")
f.write("ORIGINAL PROMPT:\n")
f.write("="*80 + "\n")
f.write(evaluation.get('prompt', 'No prompt available'))
f.write("\n" + "="*80 + "\n")
f.write("LLM RESPONSE:\n")
f.write("="*80 + "\n")
f.write(evaluation.get('llm_response', 'No response'))
f.write("\n" + "="*80 + "\n")
f.write("EXPECTED SOLUTION:\n")
f.write("="*80 + "\n")
f.write(evaluation.get('expected_solution', 'No expected solution'))
                    # Append the extracted-answer info (pulled from the reward computation results)
if 'reward_computation' in result['steps']:
reward_step = result['steps']['reward_computation']
rewards = reward_step.get('rewards', {})
rewards_by_type = rewards.get('rewards_by_type', {})
                        # Find the reward entry for the current task
current_task_rewards = rewards_by_type.get(task_type, [])
current_reward = None
for reward in current_task_rewards:
if reward.get('task_id') == evaluation.get('task_id'):
current_reward = reward
break
if current_reward and 'extracted_answer' in current_reward:
f.write("\n" + "="*80 + "\n")
f.write("EXTRACTED ANSWER:\n")
f.write("="*80 + "\n")
f.write(current_reward['extracted_answer'])
f.write("\n" + "="*80 + "\n")
f.write("MATCH RESULT:\n")
f.write("="*80 + "\n")
                            match_result = "✅ CORRECT" if current_reward.get('basic_accuracy', 0) > 0 else "❌ INCORRECT"
f.write(f"{match_result} (Score: {current_reward.get('basic_accuracy', 0):.3f})")
response_count += 1
print(f"๐Ÿ“ LLM ์‘๋‹ต ์ €์žฅ: {response_dir}/ ({response_count}๊ฐœ ํŒŒ์ผ)")
# 4-1. ์ถ”์ถœ๋œ ์ •๋‹ต ๋ณ„๋„ ์ €์žฅ
if 'reward_computation' in result['steps']:
reward_step = result['steps']['reward_computation']
rewards = reward_step.get('rewards', {})
rewards_by_type = rewards.get('rewards_by_type', {})
extracted_dir = os.path.join(base_dir, 'extracted_answers')
os.makedirs(extracted_dir, exist_ok=True)
extracted_count = 0
for task_type, task_rewards in rewards_by_type.items():
for reward in task_rewards:
if 'extracted_answer' in reward:
task_id = reward.get('task_id', 'unknown')
extracted_file = os.path.join(extracted_dir, f"{problem_id_safe}_{task_type}_{task_id}_extracted.txt")
with open(extracted_file, 'w', encoding='utf-8') as f:
f.write(f"Task Type: {task_type}\n")
f.write(f"Task ID: {task_id}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n")
f.write("EXTRACTED ANSWER:\n")
f.write("="*80 + "\n")
f.write(reward['extracted_answer'])
f.write("\n" + "="*80 + "\n")
f.write("EXPECTED SOLUTION:\n")
f.write("="*80 + "\n")
f.write(reward['expected_solution'])
f.write("\n" + "="*80 + "\n")
f.write("MATCH RESULT:\n")
f.write("="*80 + "\n")
                        match_result = "✅ CORRECT" if reward.get('basic_accuracy', 0) > 0 else "❌ INCORRECT"
f.write(f"{match_result} (Score: {reward.get('basic_accuracy', 0):.3f})")
extracted_count += 1
print(f"๐Ÿ“ ์ถ”์ถœ๋œ ์ •๋‹ต ์ €์žฅ: {extracted_dir}/ ({extracted_count}๊ฐœ ํŒŒ์ผ)")
# 5. ์ •๋‹ต ๋น„๊ต ๋ฐ ๋ณด์ƒ ๊ฒฐ๊ณผ ์ €์žฅ
if 'reward_computation' in result['steps']:
reward_step = result['steps']['reward_computation']
rewards = reward_step.get('rewards', {})
reward_file = os.path.join(base_dir, f"{problem_id_safe}_reward_analysis.json")
with open(reward_file, 'w', encoding='utf-8') as f:
json.dump(rewards, f, indent=2, ensure_ascii=False)
        # Save a human-readable reward summary
summary_file = os.path.join(base_dir, f"{problem_id_safe}_reward_summary.txt")
with open(summary_file, 'w', encoding='utf-8') as f:
f.write(f"REWARD ANALYSIS SUMMARY\n")
f.write(f"Problem: {result['problem_id']}\n")
f.write(f"Benchmark: {result['benchmark']}\n")
f.write(f"Generated: {timestamp}\n")
f.write("="*80 + "\n")
f.write(f"OVERALL STATISTICS:\n")
f.write(f"- Total Tasks: {rewards.get('total_tasks', 0)}\n")
f.write(f"- Average Reward: {rewards.get('average_reward', 0.0):.3f}\n")
f.write("\n")
f.write(f"REWARD BY TASK TYPE:\n")
for task_type, avg_reward in rewards.get('reward_distribution', {}).items():
f.write(f"- {task_type.title()}: {avg_reward:.3f}\n")
f.write("\n")
f.write(f"DETAILED TASK REWARDS:\n")
for task_type, task_rewards in rewards.get('rewards_by_type', {}).items():
f.write(f"\n{task_type.upper()} TASKS:\n")
for reward in task_rewards:
f.write(f" Task {reward['task_id']}: ")
f.write(f"Accuracy={reward['basic_accuracy']:.3f}, ")
f.write(f"Final={reward['final_reward']:.3f}\n")
print(f"๐Ÿ“ ๋ณด์ƒ ๋ถ„์„ ์ €์žฅ: {reward_file}")
print(f"๐Ÿ“ ๋ณด์ƒ ์š”์•ฝ ์ €์žฅ: {summary_file}")
# 6. ์ „์ฒด ๊ฒฐ๊ณผ ์š”์•ฝ ์ €์žฅ (JSON ์ง๋ ฌํ™” ๊ฐ€๋Šฅํ•˜๊ฒŒ ์ˆ˜์ •)
summary_file = os.path.join(base_dir, f"{problem_id_safe}_pipeline_summary.json")
    # Clean up the result so it can be JSON-serialized
serializable_result = result.copy()
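    # Note: result.copy() is a shallow copy, so the benchmark_config conversion below also updates the nested dict inside `result` itself.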
    # Remove the BenchmarkConfig object or convert it to a serializable form
if 'steps' in serializable_result and 'problem_loading' in serializable_result['steps']:
problem_data = serializable_result['steps']['problem_loading'].get('problem', {})
if 'benchmark_config' in problem_data:
            # Convert the BenchmarkConfig object to a dictionary
config_obj = problem_data['benchmark_config']
problem_data['benchmark_config'] = {
'name': config_obj.name,
'data_path': config_obj.data_path,
'problem_prefix': config_obj.problem_prefix,
'max_problems': config_obj.max_problems,
'test_timeout': config_obj.test_timeout
}
with open(summary_file, 'w', encoding='utf-8') as f:
json.dump(serializable_result, f, indent=2, ensure_ascii=False)
print(f"๐Ÿ“ ์ „์ฒด ๊ฒฐ๊ณผ ์š”์•ฝ ์ €์žฅ: {summary_file}")
print(f"\n๐Ÿ“‚ ๋ชจ๋“  ๊ฒฐ๊ณผ ํŒŒ์ผ ์ €์žฅ ์™„๋ฃŒ: {output_dir}")
def main():
parser = argparse.ArgumentParser(description='Test Complete TestTime RLVR Pipeline')
parser.add_argument('--model', type=str, default='Qwen/Qwen2.5-7B',
help='Model name to test with')
parser.add_argument('--gpu', type=int, default=0, help='GPU ID to use')
parser.add_argument('--max_tokens', type=int, default=512, help='Max tokens for generation')
parser.add_argument('--benchmark', type=str, default='test',
choices=['test', 'humaneval', 'mbpp'],
help='Benchmark to use (test=example data, humaneval=HumanEval+, mbpp=MBPP+)')
parser.add_argument('--problem_id', type=str, default='test/simple_sum',
help='Problem ID to test (e.g., HumanEval/0, Mbpp/2)')
parser.add_argument('--output_dir', type=str, default='../tmp',
help='Output directory for detailed results')
parser.add_argument('--verbose', action='store_true', help='Verbose logging')
args = parser.parse_args()
    # GPU setup
device = f'cuda:{args.gpu}' if torch.cuda.is_available() else 'cpu'
print(f"๐ŸŽฏ Using device: {device}")
# TestTime ์„ค์ •
config = TestTimeConfig(
model_name=args.model,
max_adaptation_steps=3,
learning_rate=1e-5,
task_distribution={'induction': 0.4, 'deduction': 0.3, 'abduction': 0.3},
adaptation_batch_size=1,
max_tasks_per_type=3,
        use_flash_attention=False,  # disabled for smaller models
torch_dtype=torch.float16,
enable_gradient_checkpointing=False
)
    # Benchmark configuration (data paths resolved to absolute paths)
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if args.benchmark == 'humaneval':
benchmark_config = BenchmarkConfig.get_humaneval_config()
benchmark_config.data_path = os.path.join(base_dir, 'evaluation/code_eval/data/HumanEvalPlus.jsonl')
elif args.benchmark == 'mbpp':
benchmark_config = BenchmarkConfig.get_mbpp_config()
benchmark_config.data_path = os.path.join(base_dir, 'evaluation/code_eval/data/MbppPlus.jsonl')
else: # test
benchmark_config = BenchmarkConfig(
name='test_humaneval',
data_path='test_data',
problem_prefix='TestEval',
max_problems=1,
test_timeout=30
)
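        # In test mode this placeholder data_path is not expected to be read: load_problem is
        # replaced with the built-in example problem later in this function.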
    # Logger setup
logger = TestTimeLogger(log_level='DEBUG' if args.verbose else 'INFO')
logger.log_info("๐Ÿš€ Starting Complete TestTime RLVR Pipeline Test")
logger.log_info(f"๐Ÿ“‹ Model: {args.model}")
logger.log_info(f"๐ŸŽฏ Device: {device}")
try:
        # Load the model and tokenizer (with vLLM optimization)
        logger.log_info("📦 Loading model and tokenizer with VLLM optimization...")
model, tokenizer = InitialSolutionGenerator.load_model_with_optimizations(
            args.model, device, config, use_vllm=True  # enable vLLM optimization
)
logger.log_info("โœ… Model loaded successfully")
# ํŒŒ์ดํ”„๋ผ์ธ ์ดˆ๊ธฐํ™”
logger.log_info("๐Ÿ”ง Initializing pipeline...")
pipeline = CompleteTestTimePipeline(model, tokenizer, config, logger)
logger.log_info("โœ… Pipeline initialized")
# ๋ฌธ์ œ ID ์„ค์ •
problem_id = args.problem_id
logger.log_info(f"๐Ÿ“„ Testing with {args.benchmark} benchmark")
logger.log_info(f"๐Ÿ” Problem ID: {problem_id}")
# ํ…Œ์ŠคํŠธ ๋ชจ๋“œ์ธ ๊ฒฝ์šฐ ์˜ˆ์‹œ ๋ฐ์ดํ„ฐ ์‚ฌ์šฉ
if args.benchmark == 'test':
test_problem = load_test_problem()
logger.log_info(f"๐Ÿ” Problem preview: {test_problem['prompt'][:100]}...")
# ์ž„์‹œ๋กœ ๋ฌธ์ œ๋ฅผ pipeline์˜ benchmark_loader์— ์ง์ ‘ ์ œ๊ณต
pipeline.benchmark_loader.load_problem = lambda cfg, pid: test_problem
else:
            # When using a real benchmark, preview the prompt first
temp_problem = pipeline.benchmark_loader.load_problem(benchmark_config, problem_id)
            # Apply the AZR code-evaluation prompt format
azr_prompt = f"Please provide a self-contained Python script that solves the following problem in a markdown code block:\n\n{temp_problem.get('prompt', 'No prompt available')}"
print(f"\n๐Ÿ“‹ **ORIGINAL PROBLEM:**")
print("="*80)
print(temp_problem.get('prompt', 'No prompt available'))
print("="*80)
print(f"\n๐Ÿ“‹ **AZR CODE EVALUATION PROMPT (์‹ค์ œ ์‚ฌ์šฉ๋˜๋Š” ํ”„๋กฌํ”„ํŠธ):**")
print("="*80)
print(azr_prompt)
print("="*80)
print(f"๐Ÿ“Œ Entry Point: {temp_problem.get('entry_point', 'N/A')}")
print(f"๐Ÿ“Œ Task ID: {temp_problem.get('task_id', 'N/A')}")
if 'test' in temp_problem:
print(f"๐Ÿ“Œ Test Preview: {str(temp_problem['test'])[:200]}...")
print("="*80)
        # Run the complete pipeline
        logger.log_info("🏃‍♂️ Running complete pipeline...")
print("\n" + "="*60)
print("๐Ÿš€ COMPLETE TESTTIME RLVR PIPELINE EXECUTION")
print(f"๐Ÿ“‹ Benchmark: {args.benchmark}")
print(f"๐Ÿ” Problem: {problem_id}")
print("="*60)
result = pipeline.run_complete_pipeline(benchmark_config, problem_id)
print("\n" + "="*60)
print("๐Ÿ“Š PIPELINE EXECUTION RESULTS")
print("="*60)
        # Print the overall result
        print(f"✅ Success: {result['success']}")
if result['error']:
print(f"โŒ Error: {result['error']}")
print(f"๐Ÿ“‹ Problem: {result['problem_id']}")
print(f"๐Ÿท๏ธ Benchmark: {result['benchmark']}")
# ๋‹จ๊ณ„๋ณ„ ๊ฒฐ๊ณผ ์ถœ๋ ฅ
for step_name, step_result in result['steps'].items():
print(f"\n๐Ÿ“ Step: {step_name.replace('_', ' ').title()}")
print(f" Success: {'โœ…' if step_result['success'] else 'โŒ'}")
if step_name == 'llm_generation':
solution = step_result.get('solution', '')
print(f" Solution preview: {solution[:100]}...")
print(f" Syntax valid: {'โœ…' if step_result.get('syntax_valid') else 'โŒ'}")
# ์ดˆ๊ธฐ ์†”๋ฃจ์…˜ ์ •ํ™•์„ฑ ํ‰๊ฐ€ ๊ฒฐ๊ณผ ํ‘œ์‹œ
eval_result = step_result.get('solution_evaluation')
if eval_result:
if eval_result['correct']:
print(f" โœ… Solution CORRECT ({eval_result['passed_tests']}/{eval_result['total_tests']} tests passed)")
else:
print(f" โŒ Solution INCORRECT ({eval_result['passed_tests']}/{eval_result['total_tests']} tests passed)")
if eval_result.get('error'):
print(f" Error: {eval_result['error'][:80]}...")
elif step_name == 'ipo_extraction':
print(f" IPO triples extracted: {step_result.get('num_triples', 0)}")
elif step_name == 'task_generation':
print(f" Total tasks generated: {step_result.get('total_tasks', 0)}")
for task_type, count in step_result.get('tasks_by_type', {}).items():
print(f" {task_type}: {count}")
elif step_name == 'task_evaluation':
evaluations = step_result.get('evaluations', {})
total_evaluated = sum(len(evals) for evals in evaluations.values())
print(f" Tasks evaluated: {total_evaluated}")
elif step_name == 'reward_computation':
rewards = step_result.get('rewards', {})
print(f" Average reward: {rewards.get('average_reward', 0.0):.3f}")
print(f" Total tasks scored: {rewards.get('total_tasks', 0)}")
                # Show answer-extraction details
for task_type, type_rewards in rewards.get('rewards_by_type', {}).items():
print(f" {task_type.title()} Tasks:")
                    for reward in type_rewards[:2]:  # show only the first 2
                        print(f" Task {reward['task_id']}: Expected='{reward['expected_solution'][:50]}...' | Extracted='{reward['extracted_answer'][:50]}...' | Match={'✅' if reward['basic_accuracy'] > 0 else '❌'}")
        # Show detailed results (verbose mode)
if args.verbose and result['success']:
print("\n" + "="*60)
print("๐Ÿ” DETAILED RESULTS (VERBOSE MODE)")
print("="*60)
            # IPO extraction details
if 'ipo_extraction' in result['steps']:
ipo_step = result['steps']['ipo_extraction']
triples = ipo_step.get('triples', [])
print(f"\n๐Ÿ“Š IPO Triples ({len(triples)}):")
for i, triple in enumerate(triples[:3]): # ์ฒ˜์Œ 3๊ฐœ๋งŒ ํ‘œ์‹œ
print(f" [{i+1}] Input: {str(triple.get('input', 'N/A'))[:50]}...")
print(f" Output: {str(triple.get('output', 'N/A'))[:50]}...")
            # Task generation details
if 'task_generation' in result['steps']:
task_step = result['steps']['task_generation']
all_tasks = task_step.get('all_tasks', {})
print(f"\n๐ŸŽฏ Generated Tasks:")
for task_type, tasks in all_tasks.items():
print(f" {task_type.title()} Tasks ({len(tasks)}):")
                    for i, task in enumerate(tasks[:2]):  # show only the first 2
prompt = task.get('prompt', '')
print(f" [{i+1}] {prompt[:80]}...")
            # Reward distribution details
if 'reward_computation' in result['steps']:
reward_step = result['steps']['reward_computation']
rewards = reward_step.get('rewards', {})
distribution = rewards.get('reward_distribution', {})
print(f"\n๐Ÿ† Reward Distribution:")
for task_type, avg_reward in distribution.items():
print(f" {task_type.title()}: {avg_reward:.3f}")
print("\n" + "="*60)
print("๐ŸŽ‰ PIPELINE TEST COMPLETED SUCCESSFULLY")
print("="*60)
        # Save detailed result files
if result['success']:
print(f"\n๐Ÿ“ ์ƒ์„ธ ๊ฒฐ๊ณผ ํŒŒ์ผ ์ €์žฅ ์ค‘...")
save_detailed_results(result, args, args.output_dir)
return result['success']
except Exception as e:
        logger.log_error(f"💥 Pipeline test failed: {e}")
import traceback
traceback.print_exc()
return False
finally:
        # Clean up GPU memory
if torch.cuda.is_available():
torch.cuda.empty_cache()
logger.log_info("๐Ÿงน Cleaned up resources")
if __name__ == '__main__':
success = main()
exit_code = 0 if success else 1
print(f"\n๐Ÿšช Exiting with code {exit_code}")
sys.exit(exit_code)