# Provenance: uploaded by hjkim00 — TestTime-RLVR-v2 from the
# Full-pipeline-relative_0827 branch (commit f50dc54, verified).
"""
TestTime RLVR Configuration
AZR ๊ธฐ๋ฐ˜ TestTime RLVR์„ ์œ„ํ•œ ์„ค์ • ํด๋ž˜์Šค
"""
from dataclasses import dataclass, fields
from typing import Any, Dict, List, Optional

import torch
@dataclass
class TestTimeConfig:
    """Configuration for TestTime RLVR (AZR-based test-time adaptation).

    Groups model, training-loop, iteration, IPO-extraction, program-generation,
    task-generation, reward, logging, and memory settings in one dataclass.
    ``__post_init__`` fills in the default task distribution and resolves
    ``device`` / ``torch_dtype``.
    """

    # ------------------------------------------------------------------
    # Base model settings (AZR-based)
    # ------------------------------------------------------------------
    model_name: str = "Qwen/Qwen2.5-7B"
    device: str = "auto"                        # "auto" resolves to cuda/cpu in __post_init__
    torch_dtype: torch.dtype = torch.bfloat16   # forced to float32 when device is cpu
    use_flash_attention: bool = True
    enable_gradient_checkpointing: bool = True

    # ------------------------------------------------------------------
    # TestTime training settings
    # ------------------------------------------------------------------
    max_adaptation_steps: int = 10        # short adaptation compared to AZR
    adaptation_batch_size: int = 1        # small batch
    gradient_accumulation_steps: int = 4
    learning_rate: float = 1e-6           # same as AZR

    # ------------------------------------------------------------------
    # Iteration control
    # ------------------------------------------------------------------
    max_cycles: int = 3                      # maximum number of cycles
    min_improvement_threshold: float = 0.05  # minimum improvement threshold
    early_stopping_patience: int = 2         # early-stopping patience

    # ------------------------------------------------------------------
    # IPO extraction settings
    # ------------------------------------------------------------------
    max_ipo_triples: int = 10         # maximum number of triples to extract
    python_executor_timeout: int = 5  # shorter timeout than AZR
    validate_triples: bool = True     # whether to validate extracted triples

    # ------------------------------------------------------------------
    # Diverse program generation settings
    # ------------------------------------------------------------------
    num_program_variations: int = 4                # number of diverse programs to generate
    baseline_evaluation_rounds: int = 5            # rounds for baseline performance measurement
    diverse_generation_temperature: float = 0.7    # temperature for diverse program generation
    baseline_generation_temperature: float = 0.05  # temperature for baseline measurement

    # ------------------------------------------------------------------
    # Task generation settings
    # ------------------------------------------------------------------
    # BUGFIX: was annotated `Dict[str, float] = None`, which is not a valid
    # optional type. `None` is replaced by a near-uniform distribution in
    # __post_init__ (also avoids the mutable-default-argument pitfall).
    task_distribution: Optional[Dict[str, float]] = None  # induction:deduction:abduction ratio
    max_tasks_per_type: int = 5        # maximum tasks per type
    use_azr_templates: bool = True     # use AZR templates
    skip_task_evaluation: bool = True  # skip task evaluation (stage 4); performed in VeRL

    # ------------------------------------------------------------------
    # Reward settings (AZR-based)
    # ------------------------------------------------------------------
    use_accuracy_reward: bool = True
    use_improvement_reward: bool = True  # TestTime-only improvement reward
    use_complexity_reward: bool = True
    accuracy_weight: float = 1.0
    improvement_weight: float = 0.5      # improvement reward weight
    complexity_weight: float = 0.1

    # ------------------------------------------------------------------
    # Logging settings
    # ------------------------------------------------------------------
    log_level: str = "INFO"
    save_intermediate_results: bool = True
    log_ipo_details: bool = True
    log_task_details: bool = True
    log_training_metrics: bool = True

    # ------------------------------------------------------------------
    # Memory optimization settings (AZR-based)
    # ------------------------------------------------------------------
    gpu_memory_utilization: float = 0.4
    max_workers: int = 2                         # Python executor workers
    use_memory_efficient_attention: bool = True

    def __post_init__(self) -> None:
        """Fill in derived defaults after dataclass initialization."""
        if self.task_distribution is None:
            # Default task distribution: near-uniform split (sums to 1.0).
            self.task_distribution = {
                "induction": 0.33,
                "deduction": 0.33,
                "abduction": 0.34,
            }
        # Resolve "auto" device from CUDA availability.
        if self.device == "auto":
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # bfloat16 is not appropriate on CPU; fall back to float32.
        if self.device == "cpu":
            self.torch_dtype = torch.float32

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable summary of the key settings.

        Note: this is a summary, not a full dump — only the fields most
        relevant for experiment tracking are included.
        """
        return {
            "model_name": self.model_name,
            "device": self.device,
            "torch_dtype": str(self.torch_dtype),
            "max_adaptation_steps": self.max_adaptation_steps,
            "max_cycles": self.max_cycles,
            "learning_rate": self.learning_rate,
            "task_distribution": self.task_distribution,
            "reward_weights": {
                "accuracy": self.accuracy_weight,
                "improvement": self.improvement_weight,
                "complexity": self.complexity_weight,
            },
        }

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any]) -> 'TestTimeConfig':
        """Build a config from a dictionary, ignoring unknown keys.

        BUGFIX: the original `cls(**config_dict)` crashed on dictionaries
        produced by `to_dict()` — the derived key "reward_weights" raised
        TypeError as an unexpected keyword, and "torch_dtype" is serialized
        there as a string (e.g. "torch.bfloat16") that was never converted
        back. Unknown keys are now dropped and string dtypes are resolved
        to real `torch.dtype` objects, so round-tripping works.
        """
        valid_names = {f.name for f in fields(cls)}
        kwargs = {k: v for k, v in config_dict.items() if k in valid_names}
        dtype = kwargs.get("torch_dtype")
        if isinstance(dtype, str):
            # "torch.bfloat16" -> torch.bfloat16
            kwargs["torch_dtype"] = getattr(torch, dtype.split(".")[-1])
        return cls(**kwargs)
@dataclass
class BenchmarkConfig:
    """Per-benchmark settings (e.g. "humaneval", "mbpp", "livecodebase")."""

    name: str              # benchmark identifier
    data_path: str         # path to the benchmark data file
    problem_prefix: str    # problem-id prefix, e.g. "HumanEval", "Mbpp"
    start_index: int = 0   # first problem index (MBPP starts at 2)
    max_problems: int = 5  # number of problems to test

    # Benchmark-specific tuning.
    test_timeout: int = 10
    use_plus_version: bool = True  # use the HumanEval+ / MBPP+ variants

    @classmethod
    def get_humaneval_config(cls) -> 'BenchmarkConfig':
        """Preset for the HumanEval(+) benchmark."""
        preset = dict(
            name="humaneval",
            data_path="evaluation/code_eval/data/HumanEvalPlus.jsonl",
            problem_prefix="HumanEval",
            start_index=0,
            max_problems=5,
        )
        return cls(**preset)

    @classmethod
    def get_mbpp_config(cls) -> 'BenchmarkConfig':
        """Preset for the MBPP(+) benchmark (problem ids start at 2)."""
        preset = dict(
            name="mbpp",
            data_path="evaluation/code_eval/data/MbppPlus.jsonl",
            problem_prefix="Mbpp",
            start_index=2,
            max_problems=5,
        )
        return cls(**preset)