#!/usr/bin/env python3
"""
Test a single MBPP problem using EvalPlus-style evaluation.

Loads one problem from the MbppPlus JSONL dataset, evaluates its canonical
solution and a deliberately wrong solution through
``InitialSolutionGenerator.evaluate_solution``, and prints both results.
"""

import json
import sys
from pathlib import Path

# Make the directory containing this script importable so that the
# `absolute_zero_reasoner` package can be resolved when run as a script.
sys.path.append(str(Path(__file__).parent))

from absolute_zero_reasoner.testtime.solution_generator import InitialSolutionGenerator
from absolute_zero_reasoner.testtime.config import TestTimeConfig
from absolute_zero_reasoner.testtime.logger import TestTimeLogger

# Default location of the MbppPlus dataset (one JSON object per line).
MBPP_PLUS_PATH = Path(
    "/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/data/MbppPlus.jsonl"
)


def load_single_problem(task_id: str = "Mbpp/2", dataset_path: Path = MBPP_PLUS_PATH) -> dict:
    """Load a single MBPP problem from a JSONL dataset file.

    Args:
        task_id: Value of the ``task_id`` field to look for.
        dataset_path: JSONL file to scan; defaults to ``MBPP_PLUS_PATH``.

    Returns:
        The first decoded JSON object whose ``task_id`` equals ``task_id``.

    Raises:
        ValueError: If no record in the file has the requested ``task_id``.
    """
    with open(dataset_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            problem = json.loads(line)
            if problem['task_id'] == task_id:
                return problem

    raise ValueError(f"Problem {task_id} not found")


def _print_result(result: dict, title: str) -> None:
    """Print the pass/fail summary of one ``evaluate_solution`` result.

    Reads the keys ``correct``, ``base_passed``, ``base_total``,
    ``plus_passed``, ``plus_total``, ``passed_tests``, ``total_tests`` and
    ``error`` from ``result``; the error line is printed only when truthy.
    """
    print(f"\n=== {title} ===")
    print(f"Correct: {result['correct']}")
    print(f"Base passed: {result['base_passed']}/{result['base_total']}")
    print(f"Plus passed: {result['plus_passed']}/{result['plus_total']}")
    print(f"Total passed: {result['passed_tests']}/{result['total_tests']}")
    if result['error']:
        print(f"Error: {result['error']}")


def test_evalplus_evaluation():
    """Run the EvalPlus-style evaluation on problem ``Mbpp/2`` and print results.

    Evaluates the problem's canonical solution first, then a deliberately
    wrong solution, printing a summary for each.
    """
    # Configuration
    config = TestTimeConfig()
    logger = TestTimeLogger()

    # Minimal mock model and tokenizer (no real model inference is performed).
    # NOTE(review): `device` is a method here, not an attribute/property --
    # confirm the generator never reads `model.device` as a plain value.
    class MockModel:
        def device(self):
            return 'cpu'

    class MockTokenizer:
        eos_token = ''
        pad_token = ''

    # Initialize the solution generator
    generator = InitialSolutionGenerator(
        model=MockModel(),
        tokenizer=MockTokenizer(),
        config=config,
        logger=logger,
        use_vllm=False
    )

    # Load the problem
    problem = load_single_problem("Mbpp/2")
    print(f"Testing problem: {problem['task_id']}")
    print(f"Entry point: {problem['entry_point']}")
    print(f"Base inputs: {len(problem.get('base_input', []))}")
    print(f"Plus inputs: {len(problem.get('plus_input', []))}")

    # Test with the canonical (reference) solution
    solution = problem['canonical_solution']
    print("\nTesting with canonical solution:")
    # Show at most the first 200 characters of the solution.
    preview = solution[:200] + "..." if len(solution) > 200 else solution
    print(preview)

    # Run the evaluation and print the result
    result = generator.evaluate_solution(problem, solution)
    _print_result(result, "Evaluation Result")

    # Also test with a wrong solution (returns the union, not the intersection)
    wrong_solution = """
def similar_elements(test_tup1, test_tup2):
    # 의도적으로 잘못된 솔루션 - 교집합이 아닌 합집합 반환
    return tuple(set(test_tup1) | set(test_tup2))
"""

    print("\n\nTesting with wrong solution:")
    print(wrong_solution)

    result2 = generator.evaluate_solution(problem, wrong_solution)
    _print_result(result2, "Evaluation Result (Wrong Solution)")


if __name__ == "__main__":
    test_evalplus_evaluation()