#!/usr/bin/env python3
"""
Direct EvalPlus usage test.

Loads a single MBPP+ problem and evaluates its canonical solution through
InitialSolutionGenerator.
"""

import json
import sys
from pathlib import Path

# Add import paths (the coding directory below is expected to provide `evalplus`)
sys.path.append(str(Path(__file__).parent))
sys.path.insert(0, "/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/coding")

# Debug print
print(f"Python path: {sys.path[:3]}")
try:
    import evalplus
    print("EvalPlus imported successfully!")
except ImportError as e:
    print(f"Failed to import evalplus: {e}")

from absolute_zero_reasoner.testtime.solution_generator import InitialSolutionGenerator
from absolute_zero_reasoner.testtime.config import TestTimeConfig
from absolute_zero_reasoner.testtime.logger import TestTimeLogger

def load_single_problem(task_id: str = "Mbpp/2"):
    """๋‹จ์ผ MBPP ๋ฌธ์ œ ๋กœ๋“œ"""
    dataset_path = Path("/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/data/MbppPlus.jsonl")
    
    with open(dataset_path, 'r') as f:
        for line in f:
            problem = json.loads(line.strip())
            if problem['task_id'] == task_id:
                return problem
    
    raise ValueError(f"Problem {task_id} not found")

def test_evalplus_direct():
    """EvalPlus ์ง์ ‘ ์‚ฌ์šฉ ํ…Œ์ŠคํŠธ"""
    # Set up config and logger
    config = TestTimeConfig()
    logger = TestTimeLogger()
    
    # Minimal mock model and tokenizer: generation is not exercised here,
    # only the evaluation path.
    class MockModel:
        @property
        def device(self):
            # Expose `device` as a property so `model.device` yields 'cpu',
            # mirroring attribute-style access on real models.
            return 'cpu'
    
    class MockTokenizer:
        eos_token = '</s>'
        pad_token = '</s>'
    
    # Initialize the solution generator
    generator = InitialSolutionGenerator(
        model=MockModel(),
        tokenizer=MockTokenizer(),
        config=config,
        logger=logger,
        use_vllm=False
    )
    
    # Load the problem
    problem = load_single_problem("Mbpp/2")
    print(f"Testing problem: {problem['task_id']}")
    print(f"Entry point: {problem['entry_point']}")
    print(f"Base inputs: {len(problem.get('base_input', []))}")
    print(f"Plus inputs: {len(problem.get('plus_input', []))}")
    
    # Test with the canonical (reference) solution
    solution = problem['canonical_solution']
    print("\nTesting with canonical solution:")
    print(solution)
    
    # Run the evaluation
    result = generator.evaluate_solution(problem, solution)
    
    # Print the results
    print("\n=== Evaluation Result (Canonical) ===")
    print(f"Correct: {result['correct']}")
    print(f"Base passed: {result['base_passed']}/{result['base_total']}")
    print(f"Plus passed: {result['plus_passed']}/{result['plus_total']}")
    print(f"Total passed: {result['passed_tests']}/{result['total_tests']}")
    if result['error']:
        print(f"Error: {result['error']}")

if __name__ == "__main__":
    test_evalplus_direct()
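
# Example invocation (sketch; the script filename is hypothetical and the
# dataset/module paths hard-coded above must exist on the machine):
#   python test_evalplus_direct.py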