# neural-mesh / test / validate_environment.py
# Provenance: uploaded by hjkim00 — "Upload TestTime-RLVR-v2 from
# Full-pipeline-relative_0827 branch" (commit f50dc54, verified)
#!/usr/bin/env python3
"""
TTRLVR + AZR environment validation script.

Verifies that every component required by the actual execution environment
is configured correctly:
1. Python packages and versions
2. GPU and CUDA environment
3. File paths and permissions
4. Model loading test
5. AZR configuration file validation
6. Simple pipeline execution test
"""
import os
import sys
import json
import subprocess
import tempfile
import traceback
from pathlib import Path
from datetime import datetime
# ๊ฒฝ๋กœ ์„ค์ •
sys.path.append('/home/ubuntu/RLVR/TestTime-RLVR-v2')
class EnvironmentValidator:
    """Validates the runtime environment for the TTRLVR + AZR pipeline.

    Each ``test_*`` method checks one aspect of the environment (packages,
    GPU, file paths, model loading, AZR config, pipeline smoke test),
    prints a human-readable verdict, and records the outcome in
    ``self.results`` so that ``main()`` can serialize a full JSON report.
    """

    def __init__(self):
        # Accumulated report: per-test outcomes plus actionable recommendations.
        self.results = {
            'timestamp': datetime.now().isoformat(),
            'tests': {},
            'overall_success': False,
            'recommendations': []
        }

    def log_test(self, test_name: str, success: bool, message: str, details: str = None):
        """Print and record the outcome of a single named test."""
        status = "✅ PASS" if success else "❌ FAIL"
        print(f"{status} {test_name}: {message}")
        if details:
            print(f" Details: {details}")

        self.results['tests'][test_name] = {
            'success': success,
            'message': message,
            'details': details
        }

        # Blank line visually separates failed tests from subsequent output.
        if not success:
            print()

    def add_recommendation(self, recommendation: str):
        """Record and print an actionable recommendation for the operator."""
        self.results['recommendations'].append(recommendation)
        print(f"💡 Recommendation: {recommendation}")

    @staticmethod
    def _version_tuple(version: str) -> tuple:
        """Parse a dotted version string into a tuple of ints for comparison.

        Local/build suffixes (e.g. '2.0.1+cu118') are dropped and parsing
        stops at the first component with no digits, so pre-release tags do
        not raise.
        """
        parts = []
        for token in version.split('+')[0].split('.'):
            digits = ''.join(ch for ch in token if ch.isdigit())
            if not digits:
                break
            parts.append(int(digits))
        return tuple(parts)

    def test_python_packages(self):
        """Check that required packages are importable at minimum versions."""
        import importlib  # local import: only this check needs it

        required_packages = {
            'torch': '2.0.0',
            'transformers': '4.30.0',
            'pandas': '1.5.0',
            'numpy': '1.21.0',
            'vllm': '0.3.0'
        }

        missing_packages = []
        version_issues = []

        for package, min_version in required_packages.items():
            try:
                if package == 'vllm':
                    # vLLM is optional; absence only yields a recommendation.
                    try:
                        import vllm
                        version = vllm.__version__
                    except ImportError:
                        self.add_recommendation(f"Consider installing vLLM for better GPU performance: pip install vllm")
                        continue
                else:
                    # importlib.import_module avoids exec()/eval() on
                    # package names (the previous implementation used both).
                    module = importlib.import_module(package)
                    version = module.__version__

                # Compare numeric tuples, not raw strings, so that e.g.
                # '10.0.0' is not treated as older than '2.0.0'.
                if self._version_tuple(version) < self._version_tuple(min_version):
                    version_issues.append(f"{package}: {version} < {min_version}")

            except ImportError:
                missing_packages.append(package)
            except Exception as e:
                version_issues.append(f"{package}: Error checking version - {e}")

        if missing_packages:
            self.log_test(
                "Python Packages",
                False,
                f"Missing packages: {', '.join(missing_packages)}",
                f"Install with: pip install {' '.join(missing_packages)}"
            )
            return False
        elif version_issues:
            self.log_test(
                "Python Packages",
                False,
                f"Version issues: {', '.join(version_issues)}",
                "Update packages to meet minimum requirements"
            )
            return False
        else:
            self.log_test("Python Packages", True, "All required packages installed")
            return True

    def test_gpu_environment(self):
        """Check CUDA availability and that the current GPU has enough VRAM."""
        try:
            import torch

            # CUDA availability check.
            if not torch.cuda.is_available():
                self.log_test("GPU Environment", False, "CUDA not available")
                self.add_recommendation("Install CUDA toolkit and PyTorch with CUDA support")
                return False

            # Device count and identity.
            gpu_count = torch.cuda.device_count()
            current_device = torch.cuda.current_device()
            device_name = torch.cuda.get_device_name(current_device)

            # Memory figures converted from bytes to GiB.
            memory_reserved = torch.cuda.memory_reserved() / 1024**3  # GB
            memory_total = torch.cuda.get_device_properties(current_device).total_memory / 1024**3  # GB

            details = f"GPUs: {gpu_count}, Current: {device_name}, Memory: {memory_total:.1f}GB total, {memory_reserved:.1f}GB reserved"

            # 7B models need roughly 8GB of VRAM at minimum.
            if memory_total < 8.0:
                self.log_test("GPU Environment", False, f"GPU memory insufficient: {memory_total:.1f}GB", details)
                self.add_recommendation("Use a GPU with at least 8GB VRAM for 7B models")
                return False

            self.log_test("GPU Environment", True, f"GPU environment ready", details)
            return True

        except Exception as e:
            self.log_test("GPU Environment", False, f"Error checking GPU: {e}")
            return False

    def test_file_paths_and_permissions(self):
        """Check that critical paths exist and carry read/write permissions."""
        critical_paths = {
            '/home/ubuntu/RLVR/TestTime-RLVR-v2': 'Main project directory',
            '/home/ubuntu/RLVR/TestTime-RLVR-v2/test/configs/ttrlvr_azr_7b_single_gpu.sh': 'AZR config script',
            '/data/RLVR/checkpoints': 'Checkpoint directory (will be created)',
            '/tmp': 'Temporary directory'
        }

        issues = []

        for path, description in critical_paths.items():
            if not os.path.exists(path):
                if 'checkpoints' in path:
                    # The checkpoint directory may be created on demand.
                    try:
                        os.makedirs(path, exist_ok=True)
                        self.log_test(f"Path: {description}", True, f"Created directory: {path}")
                    except Exception as e:
                        issues.append(f"{description}: Cannot create {path} - {e}")
                else:
                    issues.append(f"{description}: Not found - {path}")
            else:
                # Verify read access, and write access for directories.
                readable = os.access(path, os.R_OK)
                writable = os.access(path, os.W_OK)

                if not readable:
                    issues.append(f"{description}: No read permission - {path}")
                elif os.path.isdir(path) and not writable:
                    issues.append(f"{description}: No write permission - {path}")
                else:
                    self.log_test(f"Path: {description}", True, f"Accessible: {path}")

        if issues:
            self.log_test("File Paths", False, f"{len(issues)} path issues", "; ".join(issues))
            return False
        else:
            self.log_test("File Paths", True, "All critical paths accessible")
            return True

    def test_model_loading(self):
        """Lightweight model-loading check (library import only).

        Actually loading a 7B model here would risk a timeout, so importing
        ``AutoTokenizer`` stands in as a fast proxy.
        """
        try:
            from transformers import AutoTokenizer

            self.log_test("Model Loading", True, "Transformers library available for model loading")
            self.add_recommendation("Model loading test skipped to avoid timeout. Run full model test separately if needed.")
            return True

        except Exception as e:
            self.log_test("Model Loading", False, f"Failed to import transformers: {e}")
            self.add_recommendation("Install transformers library: pip install transformers")
            return False

    def test_azr_config(self):
        """Validate the AZR config script: existence, exec bit, required settings."""
        config_path = '/home/ubuntu/RLVR/TestTime-RLVR-v2/test/configs/ttrlvr_azr_7b_single_gpu.sh'

        try:
            if not os.path.exists(config_path):
                self.log_test("AZR Config", False, f"Config file not found: {config_path}")
                return False

            # The config is run as a shell script, so it must be executable.
            if not os.access(config_path, os.X_OK):
                self.log_test("AZR Config", False, f"Config file not executable: {config_path}")
                self.add_recommendation(f"Make config executable: chmod +x {config_path}")
                return False

            # Basic content validation: required settings must appear verbatim.
            with open(config_path, 'r') as f:
                content = f.read()

            required_settings = [
                'trainer.project_name=ttrlvr_azr',
                'azr.train_propose=False',
                'data.train_batch_size=8',
                'actor_rollout_ref.actor.ppo_mini_batch_size=24'
            ]

            missing_settings = [setting for setting in required_settings if setting not in content]

            if missing_settings:
                self.log_test(
                    "AZR Config",
                    False,
                    f"Missing settings: {', '.join(missing_settings)}",
                    f"Check config file: {config_path}"
                )
                return False

            self.log_test("AZR Config", True, f"Config file validated: {config_path}")
            return True

        except Exception as e:
            self.log_test("AZR Config", False, f"Error validating config: {e}")
            return False

    def test_simple_pipeline(self):
        """Smoke-test the task-generation pipeline on one synthetic IPO triple."""
        try:
            from absolute_zero_reasoner.testtime.config import TestTimeConfig
            from absolute_zero_reasoner.testtime.logger import TestTimeLogger
            from absolute_zero_reasoner.testtime.task_generator import TestTimeTaskGenerator

            # Default configuration for the smoke test.
            config = TestTimeConfig()
            config.model_name = "Qwen/Qwen2.5-7B"
            logger = TestTimeLogger()

            # Instantiate the task generator under test.
            task_generator = TestTimeTaskGenerator(config, logger)

            # One synthetic input/program/output triple.
            test_ipo_triples = [
                {
                    'id': 'test_triple_0',
                    'input': '[1, 2, 3]',
                    'actual_output': '[2, 4, 6]',
                    'program': 'def test_func(lst):\n return [x * 2 for x in lst]',
                    'full_input_str': 'test_func([1, 2, 3])',
                    'source_program_id': 'program_0',
                    'ipo_index': 0
                }
            ]

            # Generate tasks from the triple.
            tasks = task_generator.generate_tasks(test_ipo_triples, "TestProblem", 1)

            # At least one task type must have produced something.
            if not tasks or not any(len(task_list) > 0 for task_list in tasks.values()):
                self.log_test("Simple Pipeline", False, "No tasks generated")
                return False

            # Each generated task must carry the AZR metadata fields.
            for task_type, task_list in tasks.items():
                if task_list:
                    task = task_list[0]
                    required_fields = ['uid', 'ipo_group_id', 'basic_accuracy', 'ground_truth']
                    missing_fields = [field for field in required_fields if field not in task]
                    if missing_fields:
                        self.log_test(
                            "Simple Pipeline",
                            False,
                            f"Missing AZR metadata: {missing_fields}"
                        )
                        return False

            total_tasks = sum(len(task_list) for task_list in tasks.values())
            self.log_test("Simple Pipeline", True, f"Generated {total_tasks} tasks successfully")
            return True

        except Exception as e:
            self.log_test("Simple Pipeline", False, f"Pipeline test failed: {e}")
            return False

    def run_all_tests(self):
        """Run every validation test, print a summary, and return the report."""
        print("🔍 TTRLVR + AZR 환경 검증 시작")
        print("=" * 60)

        tests = [
            self.test_python_packages,
            self.test_gpu_environment,
            self.test_file_paths_and_permissions,
            self.test_model_loading,
            self.test_azr_config,
            self.test_simple_pipeline
        ]

        passed_tests = 0
        total_tests = len(tests)

        for test in tests:
            try:
                if test():
                    passed_tests += 1
                print()  # blank line between tests
            except Exception as e:
                # A crashing test counts as a failure but must not stop the run.
                print(f"❌ Test {test.__name__} crashed: {e}")
                print(f" Traceback: {traceback.format_exc()}")
                print()

        # Final summary.
        success_rate = passed_tests / total_tests * 100
        self.results['overall_success'] = passed_tests == total_tests

        print("=" * 60)
        print("📊 환경 검증 결과:")
        print(f" - 통과한 테스트: {passed_tests}/{total_tests} ({success_rate:.1f}%)")

        if self.results['recommendations']:
            print(f"\n💡 권장사항 ({len(self.results['recommendations'])}개):")
            for i, rec in enumerate(self.results['recommendations'], 1):
                print(f" {i}. {rec}")

        if self.results['overall_success']:
            print("\n🎉 환경 검증 완료! TTRLVR + AZR 실행 준비가 완료되었습니다.")
        else:
            print(f"\n⚠️ 환경 검증 실패: {total_tests - passed_tests}개 테스트 실패")
            print(" 위의 권장사항을 참고하여 문제를 해결한 후 다시 시도하세요.")

        return self.results
def main():
    """Run all validation tests and persist the report as JSON.

    Returns a process exit code: 0 when every test passed, 1 otherwise.
    """
    validator = EnvironmentValidator()
    results = validator.run_all_tests()

    # Save the detailed report to a timestamped file under /tmp.
    output_file = f"/tmp/ttrlvr_azr_validation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    # Explicit UTF-8 + ensure_ascii=False keep any non-ASCII report text
    # readable and make the output platform-independent.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\n📄 상세 결과 저장: {output_file}")
    return 0 if results['overall_success'] else 1


if __name__ == '__main__':
    sys.exit(main())