File size: 6,574 Bytes

f50dc54

"""
TestTime RLVR Configuration

AZR 기반 TestTime RLVR을 위한 설정 클래스
"""

from dataclasses import dataclass
from typing import Optional, List, Dict, Any
import torch


@dataclass
class TestTimeConfig:
    """Settings dedicated to TestTime RLVR.

    After construction, ``__post_init__`` fills in the default task
    distribution when none was given, resolves ``device="auto"`` to
    ``"cuda"`` or ``"cpu"``, and forces ``torch_dtype`` to ``torch.float32``
    whenever the device is ``"cpu"``.
    """

    # ============================================================================
    # Base model settings (AZR-based)
    # ============================================================================
    model_name: str = "Qwen/Qwen2.5-7B"
    device: str = "auto"
    torch_dtype: torch.dtype = torch.bfloat16
    use_flash_attention: bool = True
    enable_gradient_checkpointing: bool = True

    # ============================================================================
    # TestTime training settings
    # ============================================================================
    max_adaptation_steps: int = 10  # shorter adaptation training than AZR
    adaptation_batch_size: int = 1  # small batch
    gradient_accumulation_steps: int = 4
    learning_rate: float = 1e-6  # same as AZR

    # ============================================================================
    # Iteration control settings
    # ============================================================================
    max_cycles: int = 3  # maximum number of cycles
    min_improvement_threshold: float = 0.05  # minimum improvement threshold
    early_stopping_patience: int = 2  # early stopping

    # ============================================================================
    # IPO extraction settings
    # ============================================================================
    max_ipo_triples: int = 10  # maximum number of triples to extract
    python_executor_timeout: int = 5  # shorter timeout than AZR
    validate_triples: bool = True  # whether to validate triples

    # ============================================================================
    # Multi-program generation settings
    # ============================================================================
    num_program_variations: int = 4  # number of diverse programs to generate
    baseline_evaluation_rounds: int = 5  # number of baseline measurement rounds
    diverse_generation_temperature: float = 0.7  # temperature for diverse program generation
    baseline_generation_temperature: float = 0.05  # temperature for baseline measurement

    # ============================================================================
    # Task generation settings
    # ============================================================================
    # induction:deduction:abduction ratio; None means "use the default split"
    # that __post_init__ installs.
    task_distribution: Optional[Dict[str, float]] = None
    max_tasks_per_type: int = 5  # maximum number of tasks per type
    use_azr_templates: bool = True  # use AZR templates
    skip_task_evaluation: bool = True  # whether to skip task evaluation (step 4); performed in VeRL

    # ============================================================================
    # Reward settings (AZR-based)
    # ============================================================================
    use_accuracy_reward: bool = True
    use_improvement_reward: bool = True  # TestTime-specific improvement reward
    use_complexity_reward: bool = True
    accuracy_weight: float = 1.0
    improvement_weight: float = 0.5  # improvement weight
    complexity_weight: float = 0.1

    # ============================================================================
    # Logging settings
    # ============================================================================
    log_level: str = "INFO"
    save_intermediate_results: bool = True
    log_ipo_details: bool = True
    log_task_details: bool = True
    log_training_metrics: bool = True

    # ============================================================================
    # Memory optimization settings (AZR-based)
    # ============================================================================
    gpu_memory_utilization: float = 0.4
    max_workers: int = 2  # Python executor workers
    use_memory_efficient_attention: bool = True

    def __post_init__(self):
        """Post-process the settings after dataclass initialization."""
        if self.task_distribution is None:
            # Default task distribution: (approximately) uniform split.
            self.task_distribution = {
                "induction": 0.33,
                "deduction": 0.33,
                "abduction": 0.34
            }

        # Resolve the "auto" device placeholder.
        if self.device == "auto":
            self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # On CPU the dtype is always forced to float32.
        if self.device == "cpu":
            self.torch_dtype = torch.float32

    def to_dict(self) -> Dict[str, Any]:
        """Convert a summary of the settings to a dictionary.

        Only a subset of the fields is exported. ``torch_dtype`` is rendered
        with ``str()`` and the three reward weights are grouped under the
        nested ``"reward_weights"`` key.
        """
        return {
            "model_name": self.model_name,
            "device": self.device,
            "torch_dtype": str(self.torch_dtype),
            "max_adaptation_steps": self.max_adaptation_steps,
            "max_cycles": self.max_cycles,
            "learning_rate": self.learning_rate,
            "task_distribution": self.task_distribution,
            "reward_weights": {
                "accuracy": self.accuracy_weight,
                "improvement": self.improvement_weight,
                "complexity": self.complexity_weight
            }
        }

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any]) -> 'TestTimeConfig':
        """Load settings from a dictionary.

        Accepts plain field names as keyword values and, additionally, the
        two encodings produced by ``to_dict`` so that its output can be fed
        back in:

        * ``"reward_weights"``: a nested mapping whose ``<name>`` entries are
          applied to the ``<name>_weight`` fields (an explicit flat
          ``<name>_weight`` key in ``config_dict`` takes precedence).
        * ``"torch_dtype"`` given as a string such as ``"torch.bfloat16"``
          (or ``"bfloat16"``) is resolved to the matching ``torch.dtype``.

        Args:
            config_dict: Mapping of setting names to values. It is not
                modified.

        Returns:
            A new ``TestTimeConfig``.

        Raises:
            TypeError: If a key does not correspond to any field.
            ValueError: If a string ``torch_dtype`` does not name a torch dtype.
        """
        # Work on a shallow copy so the caller's mapping is left untouched.
        params = dict(config_dict)

        # Flatten the nested reward weights emitted by to_dict().
        reward_weights = params.pop("reward_weights", None) or {}
        for reward_name, weight in reward_weights.items():
            params.setdefault(f"{reward_name}_weight", weight)

        # to_dict() stringifies the dtype; turn it back into a real dtype.
        dtype = params.get("torch_dtype")
        if isinstance(dtype, str):
            resolved = getattr(torch, dtype.rsplit(".", 1)[-1], None)
            if not isinstance(resolved, torch.dtype):
                raise ValueError(f"Unknown torch dtype: {dtype!r}")
            params["torch_dtype"] = resolved

        return cls(**params)


@dataclass
class BenchmarkConfig:
    """Per-benchmark settings.

    Use the ``get_*_config`` classmethods to obtain the presets for the
    HumanEval and MBPP benchmarks.
    """

    name: str  # benchmark identifier, e.g. "humaneval", "mbpp", "livecodebase"
    data_path: str
    problem_prefix: str  # e.g. "HumanEval", "Mbpp"
    start_index: int = 0  # MBPP starts from 2
    max_problems: int = 5  # number of problems to test

    # Benchmark-specific settings
    test_timeout: int = 10
    use_plus_version: bool = True  # use HumanEval+ / MBPP+

    @classmethod
    def get_humaneval_config(cls) -> 'BenchmarkConfig':
        """Return the preset configuration for HumanEval."""
        preset = {
            "name": "humaneval",
            "data_path": "evaluation/code_eval/data/HumanEvalPlus.jsonl",
            "problem_prefix": "HumanEval",
            "start_index": 0,
            "max_problems": 5,
        }
        return cls(**preset)

    @classmethod
    def get_mbpp_config(cls) -> 'BenchmarkConfig':
        """Return the preset configuration for MBPP."""
        preset = {
            "name": "mbpp",
            "data_path": "evaluation/code_eval/data/MbppPlus.jsonl",
            "problem_prefix": "Mbpp",
            "start_index": 2,  # MBPP starts at index 2
            "max_problems": 5,
        }
        return cls(**preset)