|
|
|
""" |
|
Complete TestTime RLVR Pipeline Test Script |
|
|
|
Tests the AZR-based TestTime RLVR pipeline on real benchmark problems.

Verifies the full flow: LLM solution generation → IPO extraction → task generation → LLM evaluation → reward computation.
|
""" |
|
|
|
import os |
|
import sys |
|
import torch |
|
import argparse |
|
import json |
|
from pathlib import Path |
|
from datetime import datetime |
|
|
|
|
|
sys.path.append('/home/ubuntu/RLVR/TestTime-RLVR-v2') |
|
from absolute_zero_reasoner.testtime.complete_pipeline import CompleteTestTimePipeline |
|
from absolute_zero_reasoner.testtime.config import TestTimeConfig, BenchmarkConfig |
|
from absolute_zero_reasoner.testtime.logger import TestTimeLogger |
|
from absolute_zero_reasoner.testtime.solution_generator import InitialSolutionGenerator |
|
|
|
|
|
def load_test_problem(): |
|
"""๊ฐ๋จํ ํ
์คํธ ๋ฌธ์ ์์ฑ (HumanEval ์คํ์ผ)""" |
|
return { |
|
'task_id': 'test/simple_sum', |
|
'prompt': '''def add_two_numbers(a, b): |
|
""" |
|
Add two numbers and return the result. |
|
|
|
Args: |
|
a (int): First number |
|
b (int): Second number |
|
|
|
Returns: |
|
int: Sum of a and b |
|
|
|
Examples: |
|
>>> add_two_numbers(2, 3) |
|
5 |
|
>>> add_two_numbers(-1, 1) |
|
0 |
|
>>> add_two_numbers(0, 0) |
|
0 |
|
"""''', |
|
'entry_point': 'add_two_numbers', |
|
'canonical_solution': 'def add_two_numbers(a, b):\n return a + b', |
|
'test': '''def check(candidate): |
|
assert candidate(2, 3) == 5 |
|
assert candidate(-1, 1) == 0 |
|
assert candidate(0, 0) == 0 |
|
assert candidate(10, -5) == 5''' |
|
} |
|
|
|
|
|
def save_detailed_results(result, args, output_dir): |
|
"""์์ธํ ๊ฒฐ๊ณผ๋ฅผ ๊ฐ๋ณ ํ์ผ๋ก ์ ์ฅ""" |
|
|
|
|
|
benchmark = result.get('benchmark', 'unknown') |
|
problem_id = result['problem_id'] |
|
problem_id_safe = problem_id.replace('/', '_') |
|
|
|
|
|
base_dir = os.path.join(output_dir, benchmark, problem_id_safe) |
|
os.makedirs(base_dir, exist_ok=True) |
|
|
|
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') |
|
|
|
|
|
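# 1) Initial LLM solution: save the original benchmark problem, the generated
#    solution, its syntax check, and the correctness evaluation.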
if 'llm_generation' in result['steps']: |
|
llm_step = result['steps']['llm_generation'] |
|
|
|
initial_solution_dir = os.path.join(base_dir, 'initial_solution') |
|
os.makedirs(initial_solution_dir, exist_ok=True) |
|
|
|
|
|
if 'problem_loading' in result['steps']: |
|
problem_data = result['steps']['problem_loading'].get('problem', {}) |
|
problem_file = os.path.join(initial_solution_dir, f"{problem_id_safe}_original_problem.txt") |
|
with open(problem_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Problem ID: {result['problem_id']}\n") |
|
f.write(f"Benchmark: {result['benchmark']}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n") |
|
f.write("ORIGINAL BENCHMARK PROBLEM:\n") |
|
f.write("="*80 + "\n") |
|
f.write(problem_data.get('prompt', 'No prompt available')) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("ENTRY POINT:\n") |
|
f.write("="*80 + "\n") |
|
f.write(problem_data.get('entry_point', 'No entry point')) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("CANONICAL SOLUTION:\n") |
|
f.write("="*80 + "\n") |
|
f.write(problem_data.get('canonical_solution', 'No canonical solution')) |
|
if 'test' in problem_data: |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("TEST CASES:\n") |
|
f.write("="*80 + "\n") |
|
f.write(str(problem_data['test'])) |
|
|
|
|
|
llm_solution_file = os.path.join(initial_solution_dir, f"{problem_id_safe}_llm_solution.txt") |
|
with open(llm_solution_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Problem ID: {result['problem_id']}\n") |
|
f.write(f"Benchmark: {result['benchmark']}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n") |
|
f.write("LLM GENERATED SOLUTION:\n") |
|
f.write("="*80 + "\n") |
|
f.write(llm_step.get('solution', 'No solution generated')) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("SYNTAX VALIDATION:\n") |
|
f.write("="*80 + "\n") |
|
syntax_valid = llm_step.get('syntax_valid', False) |
|
f.write(f"Valid: {'โ
YES' if syntax_valid else 'โ NO'}") |
|
if llm_step.get('syntax_error'): |
|
f.write(f"\nError: {llm_step['syntax_error']}") |
|
|
|
|
|
f.write("\n" + "="*80 + "\n") |
|
f.write("SOLUTION CORRECTNESS EVALUATION:\n") |
|
f.write("="*80 + "\n") |
|
|
|
solution_eval = llm_step.get('solution_evaluation') |
|
if solution_eval: |
|
if solution_eval['correct']: |
|
f.write(f"Result: โ
CORRECT ({solution_eval['passed_tests']}/{solution_eval['total_tests']} tests passed)\n") |
|
else: |
|
f.write(f"Result: โ INCORRECT ({solution_eval['passed_tests']}/{solution_eval['total_tests']} tests passed)\n") |
|
|
|
if solution_eval.get('error'): |
|
f.write(f"Error: {solution_eval['error']}\n") |
|
|
|
|
|
if solution_eval.get('execution_results'): |
|
f.write("\nExecution Details:\n") |
|
for i, exec_result in enumerate(solution_eval['execution_results']): |
|
f.write(f" Test {i+1}:\n") |
|
f.write(f" Status: {exec_result.get('status', 'N/A')}\n") |
|
if 'result' in exec_result: |
|
f.write(f" Result: {exec_result['result'][:100]}...\n") |
|
else: |
|
f.write("No evaluation performed (syntax error or no test cases)\n") |
|
|
|
|
|
if 'ipo_extraction' in result['steps']: |
|
ipo_step = result['steps']['ipo_extraction'] |
|
if 'extracted_program' in ipo_step: |
|
extracted_program_file = os.path.join(initial_solution_dir, f"{problem_id_safe}_extracted_program.py") |
|
with open(extracted_program_file, 'w', encoding='utf-8') as f: |
|
f.write(f"# Problem ID: {result['problem_id']}\n") |
|
f.write(f"# Benchmark: {result['benchmark']}\n") |
|
f.write(f"# Generated: {timestamp}\n") |
|
f.write(f"# Extracted from LLM solution for IPO generation\n\n") |
|
f.write(ipo_step['extracted_program']) |
|
|
|
print(f"๐ ์ด๊ธฐ ์๋ฃจ์
์ ์ฅ: {initial_solution_dir}/") |
|
|
|
|
|
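# 2) IPO extraction: save each (input, program, output) triple as its own JSON file.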
if 'ipo_extraction' in result['steps']: |
|
ipo_step = result['steps']['ipo_extraction'] |
|
triples = ipo_step.get('triples', []) |
|
|
|
ipo_dir = os.path.join(base_dir, 'ipo_triples') |
|
os.makedirs(ipo_dir, exist_ok=True) |
|
|
|
for i, triple in enumerate(triples): |
|
triple_file = os.path.join(ipo_dir, f"{problem_id_safe}_triple_{i+1}.json") |
|
with open(triple_file, 'w', encoding='utf-8') as f: |
|
json.dump(triple, f, indent=2, ensure_ascii=False) |
|
|
|
print(f"๐ IPO ํธ๋ฆฌํ ์ ์ฅ: {ipo_dir}/ ({len(triples)}๊ฐ ํ์ผ)") |
|
|
|
|
|
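# 3) Task generation: save the prompt, expected solution, and evaluation data for
#    every generated task (induction / deduction / abduction).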
if 'task_generation' in result['steps']: |
|
task_step = result['steps']['task_generation'] |
|
all_tasks = task_step.get('all_tasks', {}) |
|
|
|
task_dir = os.path.join(base_dir, 'task_prompts') |
|
os.makedirs(task_dir, exist_ok=True) |
|
|
|
task_count = 0 |
|
for task_type, tasks in all_tasks.items(): |
|
for i, task in enumerate(tasks): |
|
task_file = os.path.join(task_dir, f"{problem_id_safe}_{task_type}_{i+1}.txt") |
|
with open(task_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Task Type: {task_type}\n") |
|
f.write(f"Task ID: {task.get('task_id', 'N/A')}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n") |
|
f.write("TASK PROMPT:\n") |
|
f.write("="*80 + "\n") |
|
f.write(task.get('prompt', 'No prompt available')) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("EXPECTED SOLUTION:\n") |
|
f.write("="*80 + "\n") |
|
f.write(task.get('expected_solution', 'No expected solution')) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("EVALUATION DATA:\n") |
|
f.write("="*80 + "\n") |
|
f.write(str(task.get('evaluation_data', 'No evaluation data'))) |
|
task_count += 1 |
|
|
|
print(f"๐ ํ์คํฌ ํ๋กฌํํธ ์ ์ฅ: {task_dir}/ ({task_count}๊ฐ ํ์ผ)") |
|
|
|
|
|
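# 4) Task evaluation: save each raw LLM response next to its prompt and expected solution.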
if 'task_evaluation' in result['steps']: |
|
eval_step = result['steps']['task_evaluation'] |
|
evaluations = eval_step.get('evaluations', {}) |
|
|
|
response_dir = os.path.join(base_dir, 'llm_responses') |
|
os.makedirs(response_dir, exist_ok=True) |
|
|
|
response_count = 0 |
|
for task_type, task_evals in evaluations.items(): |
|
for i, evaluation in enumerate(task_evals): |
|
response_file = os.path.join(response_dir, f"{problem_id_safe}_{task_type}_{i+1}_response.txt") |
|
with open(response_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Task Type: {task_type}\n") |
|
f.write(f"Task ID: {evaluation.get('task_id', 'N/A')}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n") |
|
f.write("ORIGINAL PROMPT:\n") |
|
f.write("="*80 + "\n") |
|
f.write(evaluation.get('prompt', 'No prompt available')) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("LLM RESPONSE:\n") |
|
f.write("="*80 + "\n") |
|
f.write(evaluation.get('llm_response', 'No response')) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("EXPECTED SOLUTION:\n") |
|
f.write("="*80 + "\n") |
|
f.write(evaluation.get('expected_solution', 'No expected solution')) |
|
|
|
|
|
# Attach reward info for this task to the response file, if rewards were computed.
reward_step = result['steps'].get('reward_computation', {})
rewards = reward_step.get('rewards', {})
rewards_by_type = rewards.get('rewards_by_type', {})

current_task_rewards = rewards_by_type.get(task_type, [])
|
current_reward = None |
|
for reward in current_task_rewards: |
|
if reward.get('task_id') == evaluation.get('task_id'): |
|
current_reward = reward |
|
break |
|
|
|
if current_reward and 'extracted_answer' in current_reward: |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("EXTRACTED ANSWER:\n") |
|
f.write("="*80 + "\n") |
|
f.write(current_reward['extracted_answer']) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("MATCH RESULT:\n") |
|
f.write("="*80 + "\n") |
|
match_result = "โ
CORRECT" if current_reward.get('basic_accuracy', 0) > 0 else "โ INCORRECT" |
|
f.write(f"{match_result} (Score: {current_reward.get('basic_accuracy', 0):.3f})") |
|
|
|
response_count += 1 |
|
|
|
print(f"๐ LLM ์๋ต ์ ์ฅ: {response_dir}/ ({response_count}๊ฐ ํ์ผ)") |
|
|
|
|
|
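# 5) Extracted answers: save the answer parsed from each LLM response plus its match result.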
if 'reward_computation' in result['steps']: |
|
reward_step = result['steps']['reward_computation'] |
|
rewards = reward_step.get('rewards', {}) |
|
rewards_by_type = rewards.get('rewards_by_type', {}) |
|
|
|
extracted_dir = os.path.join(base_dir, 'extracted_answers') |
|
os.makedirs(extracted_dir, exist_ok=True) |
|
|
|
extracted_count = 0 |
|
for task_type, task_rewards in rewards_by_type.items(): |
|
for reward in task_rewards: |
|
if 'extracted_answer' in reward: |
|
task_id = reward.get('task_id', 'unknown') |
|
extracted_file = os.path.join(extracted_dir, f"{problem_id_safe}_{task_type}_{task_id}_extracted.txt") |
|
with open(extracted_file, 'w', encoding='utf-8') as f: |
|
f.write(f"Task Type: {task_type}\n") |
|
f.write(f"Task ID: {task_id}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n") |
|
f.write("EXTRACTED ANSWER:\n") |
|
f.write("="*80 + "\n") |
|
f.write(reward['extracted_answer']) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("EXPECTED SOLUTION:\n") |
|
f.write("="*80 + "\n") |
|
f.write(reward['expected_solution']) |
|
f.write("\n" + "="*80 + "\n") |
|
f.write("MATCH RESULT:\n") |
|
f.write("="*80 + "\n") |
|
match_result = "โ
CORRECT" if reward.get('basic_accuracy', 0) > 0 else "โ INCORRECT" |
|
f.write(f"{match_result} (Score: {reward.get('basic_accuracy', 0):.3f})") |
|
extracted_count += 1 |
|
|
|
print(f"๐ ์ถ์ถ๋ ์ ๋ต ์ ์ฅ: {extracted_dir}/ ({extracted_count}๊ฐ ํ์ผ)") |
|
|
|
|
|
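# 6) Reward analysis: dump the full reward structure as JSON and write a readable summary.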
if 'reward_computation' in result['steps']: |
|
reward_step = result['steps']['reward_computation'] |
|
rewards = reward_step.get('rewards', {}) |
|
|
|
reward_file = os.path.join(base_dir, f"{problem_id_safe}_reward_analysis.json") |
|
with open(reward_file, 'w', encoding='utf-8') as f: |
|
json.dump(rewards, f, indent=2, ensure_ascii=False) |
|
|
|
|
|
summary_file = os.path.join(base_dir, f"{problem_id_safe}_reward_summary.txt") |
|
with open(summary_file, 'w', encoding='utf-8') as f: |
|
f.write(f"REWARD ANALYSIS SUMMARY\n") |
|
f.write(f"Problem: {result['problem_id']}\n") |
|
f.write(f"Benchmark: {result['benchmark']}\n") |
|
f.write(f"Generated: {timestamp}\n") |
|
f.write("="*80 + "\n") |
|
|
|
f.write(f"OVERALL STATISTICS:\n") |
|
f.write(f"- Total Tasks: {rewards.get('total_tasks', 0)}\n") |
|
f.write(f"- Average Reward: {rewards.get('average_reward', 0.0):.3f}\n") |
|
f.write("\n") |
|
|
|
f.write(f"REWARD BY TASK TYPE:\n") |
|
for task_type, avg_reward in rewards.get('reward_distribution', {}).items(): |
|
f.write(f"- {task_type.title()}: {avg_reward:.3f}\n") |
|
f.write("\n") |
|
|
|
f.write(f"DETAILED TASK REWARDS:\n") |
|
for task_type, task_rewards in rewards.get('rewards_by_type', {}).items(): |
|
f.write(f"\n{task_type.upper()} TASKS:\n") |
|
for reward in task_rewards: |
|
f.write(f" Task {reward['task_id']}: ") |
|
f.write(f"Accuracy={reward['basic_accuracy']:.3f}, ") |
|
f.write(f"Final={reward['final_reward']:.3f}\n") |
|
|
|
print(f"๐ ๋ณด์ ๋ถ์ ์ ์ฅ: {reward_file}") |
|
print(f"๐ ๋ณด์ ์์ฝ ์ ์ฅ: {summary_file}") |
|
|
|
|
|
summary_file = os.path.join(base_dir, f"{problem_id_safe}_pipeline_summary.json") |
|
|
|
|
|
serializable_result = result.copy() |
|
|
|
|
|
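# The loaded problem may carry a BenchmarkConfig object, which json.dump cannot
# serialize; flatten it into a plain dict first.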
if 'steps' in serializable_result and 'problem_loading' in serializable_result['steps']: |
|
problem_data = serializable_result['steps']['problem_loading'].get('problem', {}) |
|
if 'benchmark_config' in problem_data: |
|
|
|
config_obj = problem_data['benchmark_config'] |
|
problem_data['benchmark_config'] = { |
|
'name': config_obj.name, |
|
'data_path': config_obj.data_path, |
|
'problem_prefix': config_obj.problem_prefix, |
|
'max_problems': config_obj.max_problems, |
|
'test_timeout': config_obj.test_timeout |
|
} |
|
|
|
with open(summary_file, 'w', encoding='utf-8') as f: |
|
json.dump(serializable_result, f, indent=2, ensure_ascii=False) |
|
|
|
print(f"๐ ์ ์ฒด ๊ฒฐ๊ณผ ์์ฝ ์ ์ฅ: {summary_file}") |
|
print(f"\n๐ ๋ชจ๋ ๊ฒฐ๊ณผ ํ์ผ ์ ์ฅ ์๋ฃ: {output_dir}") |
|
|
|
|
|
def main(): |
|
parser = argparse.ArgumentParser(description='Test Complete TestTime RLVR Pipeline') |
|
parser.add_argument('--model', type=str, default='Qwen/Qwen2.5-7B', |
|
help='Model name to test with') |
|
parser.add_argument('--gpu', type=int, default=0, help='GPU ID to use') |
|
parser.add_argument('--max_tokens', type=int, default=512, help='Max tokens for generation') |
|
parser.add_argument('--benchmark', type=str, default='test', |
|
choices=['test', 'humaneval', 'mbpp'], |
|
help='Benchmark to use (test=example data, humaneval=HumanEval+, mbpp=MBPP+)') |
|
parser.add_argument('--problem_id', type=str, default='test/simple_sum', |
|
help='Problem ID to test (e.g., HumanEval/0, Mbpp/2)') |
|
parser.add_argument('--output_dir', type=str, default='../tmp', |
|
help='Output directory for detailed results') |
|
parser.add_argument('--verbose', action='store_true', help='Verbose logging') |
|
|
|
args = parser.parse_args() |
|
|
|
|
|
device = f'cuda:{args.gpu}' if torch.cuda.is_available() else 'cpu' |
|
print(f"๐ฏ Using device: {device}") |
|
|
|
|
|
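# Lightweight test-time adaptation settings (few steps, small batches, fp16,
# no flash attention or gradient checkpointing).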
config = TestTimeConfig( |
|
model_name=args.model, |
|
max_adaptation_steps=3, |
|
learning_rate=1e-5, |
|
task_distribution={'induction': 0.4, 'deduction': 0.3, 'abduction': 0.3}, |
|
adaptation_batch_size=1, |
|
max_tasks_per_type=3, |
|
use_flash_attention=False, |
|
torch_dtype=torch.float16, |
|
enable_gradient_checkpointing=False |
|
) |
|
|
|
|
|
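# Benchmark data paths are resolved relative to the repository root (the parent of
# this script's directory).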
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) |
|
|
|
if args.benchmark == 'humaneval': |
|
benchmark_config = BenchmarkConfig.get_humaneval_config() |
|
benchmark_config.data_path = os.path.join(base_dir, 'evaluation/code_eval/data/HumanEvalPlus.jsonl') |
|
elif args.benchmark == 'mbpp': |
|
benchmark_config = BenchmarkConfig.get_mbpp_config() |
|
benchmark_config.data_path = os.path.join(base_dir, 'evaluation/code_eval/data/MbppPlus.jsonl') |
|
else: |
|
benchmark_config = BenchmarkConfig( |
|
name='test_humaneval', |
|
data_path='test_data', |
|
problem_prefix='TestEval', |
|
max_problems=1, |
|
test_timeout=30 |
|
) |
|
|
|
|
|
logger = TestTimeLogger(log_level='DEBUG' if args.verbose else 'INFO') |
|
logger.log_info("๐ Starting Complete TestTime RLVR Pipeline Test") |
|
logger.log_info(f"๐ Model: {args.model}") |
|
logger.log_info(f"๐ฏ Device: {device}") |
|
|
|
try: |
|
|
|
logger.log_info("๐ฆ Loading model and tokenizer with VLLM optimization...") |
|
model, tokenizer = InitialSolutionGenerator.load_model_with_optimizations( |
|
args.model, device, config, use_vllm=True |
|
) |
|
logger.log_info("โ
Model loaded successfully") |
|
|
|
|
|
logger.log_info("๐ง Initializing pipeline...") |
|
pipeline = CompleteTestTimePipeline(model, tokenizer, config, logger) |
|
logger.log_info("โ
Pipeline initialized") |
|
|
|
|
|
problem_id = args.problem_id |
|
|
|
logger.log_info(f"๐ Testing with {args.benchmark} benchmark") |
|
logger.log_info(f"๐ Problem ID: {problem_id}") |
|
|
|
|
|
if args.benchmark == 'test': |
|
test_problem = load_test_problem() |
|
logger.log_info(f"๐ Problem preview: {test_problem['prompt'][:100]}...") |
|
|
|
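# For the built-in 'test' benchmark there is no data file: monkey-patch the loader
# so the pipeline receives the in-memory example problem instead.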
pipeline.benchmark_loader.load_problem = lambda cfg, pid: test_problem |
|
else: |
|
|
|
temp_problem = pipeline.benchmark_loader.load_problem(benchmark_config, problem_id) |
|
|
|
|
|
azr_prompt = f"Please provide a self-contained Python script that solves the following problem in a markdown code block:\n\n{temp_problem.get('prompt', 'No prompt available')}" |
|
|
|
print(f"\n๐ **ORIGINAL PROBLEM:**") |
|
print("="*80) |
|
print(temp_problem.get('prompt', 'No prompt available')) |
|
print("="*80) |
|
|
|
print(f"\n๐ **AZR CODE EVALUATION PROMPT (์ค์ ์ฌ์ฉ๋๋ ํ๋กฌํํธ):**") |
|
print("="*80) |
|
print(azr_prompt) |
|
print("="*80) |
|
|
|
print(f"๐ Entry Point: {temp_problem.get('entry_point', 'N/A')}") |
|
print(f"๐ Task ID: {temp_problem.get('task_id', 'N/A')}") |
|
if 'test' in temp_problem: |
|
print(f"๐ Test Preview: {str(temp_problem['test'])[:200]}...") |
|
print("="*80) |
|
|
|
|
|
logger.log_info("๐โโ๏ธ Running complete pipeline...") |
|
print("\n" + "="*60) |
|
print("๐ COMPLETE TESTTIME RLVR PIPELINE EXECUTION") |
|
print(f"๐ Benchmark: {args.benchmark}") |
|
print(f"๐ Problem: {problem_id}") |
|
print("="*60) |
|
|
|
result = pipeline.run_complete_pipeline(benchmark_config, problem_id) |
|
|
|
print("\n" + "="*60) |
|
print("๐ PIPELINE EXECUTION RESULTS") |
|
print("="*60) |
|
|
|
|
|
print(f"โ
Success: {result['success']}") |
|
if result['error']: |
|
print(f"โ Error: {result['error']}") |
|
|
|
print(f"๐ Problem: {result['problem_id']}") |
|
print(f"๐ท๏ธ Benchmark: {result['benchmark']}") |
|
|
|
|
|
for step_name, step_result in result['steps'].items(): |
|
print(f"\n๐ Step: {step_name.replace('_', ' ').title()}") |
|
print(f" Success: {'โ
' if step_result['success'] else 'โ'}") |
|
|
|
if step_name == 'llm_generation': |
|
solution = step_result.get('solution', '') |
|
print(f" Solution preview: {solution[:100]}...") |
|
print(f" Syntax valid: {'โ
' if step_result.get('syntax_valid') else 'โ'}") |
|
|
|
|
|
eval_result = step_result.get('solution_evaluation') |
|
if eval_result: |
|
if eval_result['correct']: |
|
print(f" โ
Solution CORRECT ({eval_result['passed_tests']}/{eval_result['total_tests']} tests passed)") |
|
else: |
|
print(f" โ Solution INCORRECT ({eval_result['passed_tests']}/{eval_result['total_tests']} tests passed)") |
|
if eval_result.get('error'): |
|
print(f" Error: {eval_result['error'][:80]}...") |
|
|
|
elif step_name == 'ipo_extraction': |
|
print(f" IPO triples extracted: {step_result.get('num_triples', 0)}") |
|
|
|
elif step_name == 'task_generation': |
|
print(f" Total tasks generated: {step_result.get('total_tasks', 0)}") |
|
for task_type, count in step_result.get('tasks_by_type', {}).items(): |
|
print(f" {task_type}: {count}") |
|
|
|
elif step_name == 'task_evaluation': |
|
evaluations = step_result.get('evaluations', {}) |
|
total_evaluated = sum(len(evals) for evals in evaluations.values()) |
|
print(f" Tasks evaluated: {total_evaluated}") |
|
|
|
elif step_name == 'reward_computation': |
|
rewards = step_result.get('rewards', {}) |
|
print(f" Average reward: {rewards.get('average_reward', 0.0):.3f}") |
|
print(f" Total tasks scored: {rewards.get('total_tasks', 0)}") |
|
|
|
|
|
for task_type, type_rewards in rewards.get('rewards_by_type', {}).items(): |
|
print(f" {task_type.title()} Tasks:") |
|
for reward in type_rewards[:2]: |
|
print(f" Task {reward['task_id']}: Expected='{reward['expected_solution'][:50]}...' | Extracted='{reward['extracted_answer'][:50]}...' | Match={'โ
' if reward['basic_accuracy'] > 0 else 'โ'}") |
|
|
|
|
|
if args.verbose and result['success']: |
|
print("\n" + "="*60) |
|
print("๐ DETAILED RESULTS (VERBOSE MODE)") |
|
print("="*60) |
|
|
|
|
|
if 'ipo_extraction' in result['steps']: |
|
ipo_step = result['steps']['ipo_extraction'] |
|
triples = ipo_step.get('triples', []) |
|
print(f"\n๐ IPO Triples ({len(triples)}):") |
|
for i, triple in enumerate(triples[:3]): |
|
print(f" [{i+1}] Input: {str(triple.get('input', 'N/A'))[:50]}...") |
|
print(f" Output: {str(triple.get('output', 'N/A'))[:50]}...") |
|
|
|
|
|
if 'task_generation' in result['steps']: |
|
task_step = result['steps']['task_generation'] |
|
all_tasks = task_step.get('all_tasks', {}) |
|
print(f"\n๐ฏ Generated Tasks:") |
|
for task_type, tasks in all_tasks.items(): |
|
print(f" {task_type.title()} Tasks ({len(tasks)}):") |
|
for i, task in enumerate(tasks[:2]): |
|
prompt = task.get('prompt', '') |
|
print(f" [{i+1}] {prompt[:80]}...") |
|
|
|
|
|
if 'reward_computation' in result['steps']: |
|
reward_step = result['steps']['reward_computation'] |
|
rewards = reward_step.get('rewards', {}) |
|
distribution = rewards.get('reward_distribution', {}) |
|
print(f"\n๐ Reward Distribution:") |
|
for task_type, avg_reward in distribution.items(): |
|
print(f" {task_type.title()}: {avg_reward:.3f}") |
|
|
|
print("\n" + "="*60) |
|
print("๐ PIPELINE TEST COMPLETED SUCCESSFULLY") |
|
print("="*60) |
|
|
|
|
|
if result['success']: |
|
print(f"\n๐ ์์ธ ๊ฒฐ๊ณผ ํ์ผ ์ ์ฅ ์ค...") |
|
save_detailed_results(result, args, args.output_dir) |
|
|
|
return result['success'] |
|
|
|
except Exception as e: |
|
logger.log_error(f"๐ฅ Pipeline test failed: {e}") |
|
import traceback |
|
traceback.print_exc() |
|
return False |
|
|
|
finally: |
|
|
|
if torch.cuda.is_available(): |
|
torch.cuda.empty_cache() |
|
logger.log_info("๐งน Cleaned up resources") |
|
|
|
|
|
if __name__ == '__main__': |
|
success = main() |
|
exit_code = 0 if success else 1 |
|
print(f"\n๐ช Exiting with code {exit_code}") |
|
sys.exit(exit_code) |