"""
IPO Triple Extractor
AZR Python Executor ๊ธฐ๋ฐ˜ (Input, Program, Output) ํŠธ๋ฆฌํ”Œ ์ถ”์ถœ ์‹œ์Šคํ…œ
์š”๊ตฌ์‚ฌํ•ญ 2: "AZR Python Executor๋ฅผ ์ด์šฉํ•˜์—ฌ (i,p,o) pair๋ฅผ ๋งŒ๋“ ๋‹ค"
"""
import ast
import re
import json
from typing import Dict, List, Any, Tuple, Optional
from concurrent.futures import TimeoutError
from ..utils.code_utils.python_executor import PythonExecutor
from .config import TestTimeConfig
from .logger import TestTimeLogger
from .solution_generator import InitialSolutionGenerator


class IPOBuffer:
    """Buffer that stores and manages IPO triples, keyed by problem id."""

    def __init__(self):
        self.buffer = {}  # {problem_id: [ipo_triples]}

    def add(self, problem_id: str, ipo_triple: Dict[str, Any]):
        """Add an IPO triple to the buffer."""
        if problem_id not in self.buffer:
            self.buffer[problem_id] = []
        self.buffer[problem_id].append(ipo_triple)

    def get_all(self, problem_id: str) -> List[Dict[str, Any]]:
        """Return all IPO triples for the given problem."""
        return self.buffer.get(problem_id, [])

    def clear(self, problem_id: str = None):
        """Clear the buffer (for one problem, or entirely)."""
        if problem_id:
            self.buffer.pop(problem_id, None)
        else:
            self.buffer.clear()

    def size(self, problem_id: str = None) -> int:
        """Return the number of buffered triples (for one problem, or in total)."""
        if problem_id:
            return len(self.buffer.get(problem_id, []))
        return sum(len(triples) for triples in self.buffer.values())
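
# A minimal usage sketch for IPOBuffer (illustrative only; the triple dict shown
# here mirrors a subset of the fields produced by IPOTripleExtractor below):
#
#     buffer = IPOBuffer()
#     buffer.add("HumanEval/0", {"id": "HumanEval/0_triple_0",
#                                "input": "''", "expected_output": "0"})
#     buffer.size("HumanEval/0")     # -> 1
#     buffer.get_all("HumanEval/0")  # -> [{...}]
#     buffer.clear()                 # drop everything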


class IPOTripleExtractor:
    """Extracts and validates (Input, Program, Output) triples."""

    def __init__(self, config: TestTimeConfig, logger: Optional[TestTimeLogger] = None,
                 model=None, tokenizer=None):
        self.config = config
        self.logger = logger or TestTimeLogger()
        self.model = model
        self.tokenizer = tokenizer
        # Initialize the AZR Python Executor (existing approach)
        self.executor = PythonExecutor(
            timeout_length=config.python_executor_timeout,
            ast_check=True,  # AZR default setting
            max_workers=config.max_workers
        )
        self.extracted_triples = []
        # Last prompt/response used for input generation
        self.last_generation_prompt = ""
        self.last_generation_response = ""
        # Reference used for VLLM batch processing
        self.solution_generator = None

    def extract_triples(self, problem: Dict[str, Any], solution: str) -> List[Dict[str, Any]]:
        """Extract IPO triples from a benchmark problem and its solution."""
        problem_id = problem.get('task_id', 'unknown')
        self.logger.log_info(f"🔍 Extracting IPO triples for {problem_id}")
        triples = []
        try:
            # 1. Extract function info (entry point takes priority)
            entry_point = problem.get('entry_point', 'unknown')
            func_info = self._extract_function_info(solution, entry_point)
            if not func_info:
                self.logger.log_error("Failed to extract function info from solution")
                return []
            # 2. Build input-output pairs from test cases (based on the LLM solution)
            test_cases = self._extract_test_cases(problem, solution)
            # 3. Create IPO triples by executing the solution
            for i, (test_input_str, expected_output) in enumerate(test_cases):
                if len(triples) >= self.config.max_ipo_triples:
                    break
                # Extract the actual arguments from test_input_str (e.g. "strlen('')" -> "''")
                match = re.match(rf'{re.escape(entry_point)}\((.*)\)', test_input_str)
                if match:
                    actual_args = match.group(1)
                else:
                    actual_args = test_input_str  # fallback
                triple = self._create_ipo_triple(
                    func_info['full_code'],  # use the full code (including helper functions)
                    func_info,
                    actual_args,  # pass only the actual arguments
                    expected_output,
                    triple_id=f"{problem_id}_triple_{i}",
                    full_input_str=test_input_str  # also pass the full input string
                )
                if triple:
                    triples.append(triple)
            # Synthetic triple generation was removed: only the genuine single
            # example is used, to prevent cheating.
            # Validate and log
            validation_results = [self._validate_triple(triple) for triple in triples]
            self.logger.log_ipo_extraction(problem_id, triples, validation_results)
            # Return only the valid triples
            valid_triples = [triple for triple, valid in zip(triples, validation_results) if valid]
            self.logger.log_info(f"✅ Extracted {len(valid_triples)}/{len(triples)} valid IPO triples")
            return valid_triples
        except Exception as e:
            self.logger.log_error(f"IPO extraction failed: {e}")
            return []
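
    # For reference, a triple returned by extract_triples has this shape (the
    # field values are illustrative; the keys match _create_ipo_triple below):
    #
    #     {
    #         'id': 'HumanEval/23_triple_0',
    #         'input': "''",                    # actual arguments only
    #         'full_input_str': "strlen('')",   # full call expression
    #         'program': "def strlen(string): ...",
    #         'expected_output': '0',
    #         'actual_output': '0',
    #         'function_name': 'strlen',
    #         'function_args': ['string'],
    #         'is_correct': True,
    #         'extraction_method': 'test_case',
    #     }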

    def _extract_function_info(self, solution: str, entry_point: str = None) -> Optional[Dict[str, str]]:
        """Extract function info from the solution (entry point takes priority)."""
        try:
            # If this is a raw LLM response, extract the function code first
            processed_solution = solution
            if "LLM GENERATED SOLUTION:" in solution:
                self.logger.log_info("📝 Raw LLM response detected, extracting function code")
                processed_solution = self._extract_function_from_llm_response(solution)
                if not processed_solution:
                    self.logger.log_error("Failed to extract function from LLM response")
                    return None
            # Parse function definitions with the AST
            tree = ast.parse(processed_solution)
            # Search for the entry-point function first
            target_function = None
            all_functions = []
            for node in ast.walk(tree):
                if isinstance(node, ast.FunctionDef):
                    func_info = {
                        'name': node.name,
                        'args': [arg.arg for arg in node.args.args],
                        'signature': f"def {node.name}({', '.join(arg.arg for arg in node.args.args)}):",
                        'full_code': processed_solution
                    }
                    all_functions.append(func_info)
                    # Prefer the function matching the entry point
                    if entry_point and node.name == entry_point:
                        target_function = func_info
                        # This log fires too often, so keep it at debug level
                        self.logger.log_debug(f"🎯 Found entry point function: {entry_point}")
                        break
            # Return the entry-point function if we found it
            if target_function:
                return target_function
            # Otherwise fall back to the first function (previous behavior)
            if all_functions:
                self.logger.log_warning(f"⚠️ Entry point '{entry_point}' not found, using first function: {all_functions[0]['name']}")
                return all_functions[0]
            return None
        except Exception as e:
            self.logger.log_error(f"Function parsing failed: {e}")
            return None
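
    # Example of the dict _extract_function_info returns for
    # "def add(a, b): return a + b" with entry_point="add":
    #
    #     {'name': 'add', 'args': ['a', 'b'],
    #      'signature': 'def add(a, b):',
    #      'full_code': 'def add(a, b): return a + b'}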

    def _extract_function_from_llm_response(self, llm_response: str) -> str:
        """Extract function code from a raw LLM response (same logic as solution_generator)."""
        lines = llm_response.split('\n')
        solution_lines = []
        in_solution = False
        # Extract the "LLM GENERATED SOLUTION:" section
        for line in lines:
            if "LLM GENERATED SOLUTION:" in line:
                in_solution = True
                continue
            elif in_solution:
                # A "===============" line ends the section, but the first one
                # (the opening separator) is skipped
                if "===============" in line:
                    # Check whether we already collected solution lines
                    if solution_lines and any(l.strip() for l in solution_lines):
                        break
                    else:
                        # No solution lines yet: skip this first separator
                        continue
                solution_lines.append(line)
        if not solution_lines:
            return ""  # return an empty string when extraction fails
        extracted_solution = '\n'.join(solution_lines).strip()
        # Extract the function definition and imports (same logic as solution_generator)
        lines = extracted_solution.split('\n')
        import_lines = []
        func_lines = []
        in_function = False
        indent_level = 0
        # 1. Collect import statements
        for line in lines:
            stripped = line.strip()
            if (stripped.startswith('import ') or stripped.startswith('from ')) and not stripped.startswith('#'):
                import_lines.append(line)
        # 2. Find the function definition
        for line in lines:
            if line.strip().startswith('def '):
                in_function = True
                func_lines = [line]
                indent_level = len(line) - len(line.lstrip())
            elif in_function:
                if not line.strip() or (line.strip() and len(line) - len(line.lstrip()) > indent_level):
                    func_lines.append(line)
                else:
                    break
        # 3. Combine imports + function
        if func_lines:
            result_lines = (import_lines + [''] + func_lines) if import_lines else func_lines
            return '\n'.join(result_lines)
        else:
            return extracted_solution
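
    # The raw response format this parser expects looks roughly like the
    # following (an assumption inferred from the markers searched for above):
    #
    #     LLM GENERATED SOLUTION:
    #     ===============
    #     import math
    #     def sqrt_floor(x):
    #         return int(math.sqrt(x))
    #     ===============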

    def _fix_humaneval_canonical_solution(self, problem: Dict[str, Any]) -> str:
        """Restore a HumanEval canonical solution (prepend the function signature)."""
        canonical_code = problem.get('canonical_solution', '')
        entry_point = problem.get('entry_point', '')
        prompt = problem.get('prompt', '')
        # Only HumanEval problems need this fix
        task_id = problem.get('task_id', '')
        if not task_id.startswith('HumanEval/'):
            return canonical_code
        # Skip if the function signature is already present
        if f"def {entry_point}" in canonical_code:
            return canonical_code
        try:
            # Extract the function signature from the prompt
            def_pattern = rf'def\s+{re.escape(entry_point)}\s*\([^)]*\)[^:]*:'
            match = re.search(def_pattern, prompt, re.MULTILINE)
            if match:
                function_signature = match.group(0)
                # Also extract import statements, if any
                import_lines = []
                for line in prompt.split('\n'):
                    stripped = line.strip()
                    if (stripped.startswith('import ') or stripped.startswith('from ')) and not stripped.startswith('#'):
                        import_lines.append(line)
                # Assemble the complete canonical solution
                if import_lines:
                    complete_canonical = '\n'.join(import_lines) + '\n\n' + function_signature + canonical_code
                else:
                    complete_canonical = function_signature + canonical_code
                self.logger.log_info(f"🔧 Fixed HumanEval canonical solution for {entry_point}")
                return complete_canonical
            else:
                self.logger.log_warning(f"⚠️ Could not extract function signature for {entry_point}")
                return canonical_code
        except Exception as e:
            self.logger.log_error(f"Failed to fix HumanEval canonical solution: {e}")
            return canonical_code
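
    # Why this is needed: HumanEval keeps the "def ...:" signature inside
    # `prompt` and ships only the indented body in `canonical_solution`, so the
    # body is not parseable on its own.  Sketch of the reassembly, assuming a
    # body that starts with a newline:
    #
    #     signature = "def strlen(string: str) -> int:"
    #     body      = "\n    return len(string)\n"
    #     signature + body  ->  a complete, executable definition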

    def _extract_single_prompt_example(self, problem: Dict[str, Any]) -> Optional[Tuple[str, str]]:
        """Extract only a single example from the prompt (to prevent cheating)."""
        try:
            # Use the first base_input entry as the single example
            if 'base_input' in problem and problem['base_input']:
                first_input = problem['base_input'][0]
                entry_point = problem['entry_point']
                self.logger.log_info(f"📥 Using first base_input as single example: {first_input}")
                # Restore the HumanEval canonical solution if needed
                canonical_code = self._fix_humaneval_canonical_solution(problem)
                if canonical_code:
                    actual_output = self._execute_llm_solution(canonical_code, entry_point, first_input)
                    if actual_output is not None:
                        # Build the input string
                        if isinstance(first_input, list):
                            if len(first_input) == 1 and isinstance(first_input[0], list):
                                # [[args]] -> display as a single list argument
                                input_str = repr(first_input[0])
                            elif len(first_input) == 1:
                                # [single arg] -> single arg
                                input_str = repr(first_input[0])
                            else:
                                # [multiple args] -> comma-separated args
                                input_str = ', '.join(repr(arg) for arg in first_input)
                        else:
                            input_str = repr(first_input)
                        result = (input_str, str(actual_output))
                        self.logger.log_info(f"✅ Single example extracted: Input={input_str}, Output={actual_output}")
                        return result
                    else:
                        self.logger.log_warning("❌ Failed to compute output with canonical solution")
                else:
                    self.logger.log_warning("❌ No canonical solution available")
            else:
                self.logger.log_warning("❌ No base_input available")
        except Exception as e:
            self.logger.log_error(f"Single example extraction failed: {e}")
        return None

    def _extract_docstring_examples(self, prompt: str, func_name: str) -> List[Tuple[str, str]]:
        """Extract ">>>" examples from the docstring."""
        examples = []
        lines = prompt.split('\n')
        i = 0
        while i < len(lines):
            line = lines[i].strip()
            # Look for the ">>> func_name(...)" pattern
            if line.startswith('>>>') and func_name in line:
                # Extract the input
                input_line = line[3:].strip()  # strip the ">>>"
                # The output, if any, is on the next line
                if i + 1 < len(lines):
                    output_line = lines[i + 1].strip()
                    # A line that does not start with ">>>" is the output value
                    if not output_line.startswith('>>>'):
                        examples.append((input_line, output_line))
                        i += 2
                        continue
                i += 1
            else:
                i += 1
        return examples
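
    # Illustrative input/output for this parser: given a docstring containing
    #
    #     >>> strlen('abc')
    #     3
    #
    # it returns [("strlen('abc')", "3")].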

    def _extract_test_cases(self, problem: Dict[str, Any], solution: str) -> List[Tuple[str, str]]:
        """Extract test cases from docstring examples (to prevent cheating)."""
        test_cases = []
        func_name = problem.get('entry_point', 'unknown')
        problem_id = problem.get('task_id', '')
        # Both HumanEval and MBPP use docstring examples only
        self.logger.log_info(f"🎯 Extracting docstring examples for {problem_id}")
        # Extract docstring examples from the prompt
        prompt = problem.get('prompt', '')
        examples = self._extract_docstring_examples(prompt, func_name)
        if examples:
            self.logger.log_info(f"📝 Found {len(examples)} docstring examples")
            for i, (input_str, expected_output) in enumerate(examples):
                try:
                    # Parse the input (extract args from "func_name(args)")
                    if input_str.startswith(func_name + '(') and input_str.endswith(')'):
                        args_str = input_str[len(func_name)+1:-1]
                        # Use ast.literal_eval for safe evaluation
                        try:
                            # Single-argument case
                            input_args = ast.literal_eval(args_str)
                            if not isinstance(input_args, tuple):
                                input_args = (input_args,)
                        except Exception:
                            # Multi-argument case
                            input_args = ast.literal_eval(f"({args_str})")
                        # Execute the LLM solution
                        actual_output = self._execute_llm_solution(solution, func_name, list(input_args))
                        if actual_output is not None:
                            test_cases.append((input_str, str(actual_output)))
                            self.logger.log_info(f"✅ Example {i+1}: {input_str} -> {actual_output}")
                        else:
                            self.logger.log_warning(f"❌ Example {i+1} execution failed")
                except Exception as e:
                    self.logger.log_error(f"Example {i+1} parsing failed: {e}")
        else:
            self.logger.log_warning("⚠️ No docstring examples found, falling back to first base_input")
            # With no docstring examples, use only the first base_input (as for MBPP)
            if 'base_input' in problem and problem['base_input']:
                inp_args = problem['base_input'][0]
                # Build the input string
                if isinstance(inp_args, list):
                    args_str = ', '.join(repr(arg) for arg in inp_args)
                    input_str = f"{func_name}({args_str})"
                else:
                    input_str = f"{func_name}({repr(inp_args)})"
                actual_output = self._execute_llm_solution(solution, func_name, inp_args)
                if actual_output is not None:
                    test_cases.append((input_str, str(actual_output)))
        self.logger.log_info(f"📊 Extracted {len(test_cases)} test cases from docstring examples")
        return test_cases

    def _execute_llm_solution(self, llm_solution: str, func_name: str, input_args) -> Optional[str]:
        """Execute the LLM-generated solution to compute the actual output."""
        try:
            if not llm_solution or func_name == 'unknown':
                return None
            # Build the argument string (handles MBPP+ double-wrapped lists)
            if isinstance(input_args, list):
                # MBPP+ data is sometimes wrapped in a nested list
                if len(input_args) == 1 and isinstance(input_args[0], list):
                    # [[args]] -> pass as a single list argument
                    args_str = repr(input_args[0])
                elif len(input_args) == 1:
                    # [single arg] -> pass as a single argument
                    args_str = repr(input_args[0])
                else:
                    # [multiple args] -> pass as multiple arguments
                    args_str = ', '.join(repr(arg) for arg in input_args)
            else:
                args_str = repr(input_args)
            execution_code = f"""
{llm_solution}

# Execute LLM solution
try:
    result = {func_name}({args_str})
    print(repr(result))
except Exception as e:
    print(f"EXECUTION_ERROR: {{e}}")
"""
            # Run with the AZR Python Executor
            output, status = self.executor.apply(execution_code)
            if 'error' in status.lower() or 'EXECUTION_ERROR' in output:
                return None
            # Extract the result from the output
            output_lines = output.strip().split('\n')
            if output_lines:
                result_line = output_lines[-1].strip()
                # Return the repr() output as-is
                return result_line
            return None
        except Exception as e:
            self.logger.log_error(f"LLM solution execution failed: {e}")
            return None
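
    # For func_name="strlen" and input_args=[''], the rendered execution_code is
    # roughly (solution body elided):
    #
    #     def strlen(string): ...
    #
    #     # Execute LLM solution
    #     try:
    #         result = strlen('')
    #         print(repr(result))
    #     except Exception as e:
    #         print(f"EXECUTION_ERROR: {e}")
    #
    # The last printed line is taken as the result, so a solution that prints
    # extra output would shift it; the executor's stdout is assumed clean.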

    def _create_ipo_triple(self, solution: str, func_info: Dict[str, str],
                           test_input: str, expected_output: str,
                           triple_id: str, full_input_str: str = None) -> Optional[Dict[str, Any]]:
        """Create and verify an IPO triple (using the AZR Python Executor)."""
        try:
            # 1. Execute the solution to get the actual output
            actual_output = self._execute_function(solution, func_info['name'], test_input)
            if actual_output is None:
                return None
            # 2. Assemble the IPO triple
            triple = {
                'id': triple_id,
                'input': test_input,  # actual arguments only (e.g. "''", "3.5")
                'full_input_str': full_input_str or f"{func_info['name']}({test_input})",  # full call expression
                'program': solution,  # func_info['full_code'] is passed in here
                'expected_output': expected_output,
                'actual_output': actual_output,
                'function_name': func_info['name'],
                'function_args': func_info['args'],
                'is_correct': str(actual_output) == str(expected_output),
                'extraction_method': 'test_case'
            }
            return triple
        except Exception as e:
            self.logger.log_error(f"Triple creation failed for {triple_id}: {e}")
            return None

    def _execute_function(self, code: str, func_name: str, inputs: str) -> Optional[str]:
        """Run a function with the AZR Python Executor."""
        try:
            # Build the execution code (AZR template style)
            execution_code = f"""
{code}

# Execute function with inputs
try:
    result = {func_name}({inputs})
    print(repr(result))
except Exception as e:
    print(f"EXECUTION_ERROR: {{e}}")
"""
            # Execute the AZR way
            output, status = self.executor.apply(execution_code)
            if 'error' in status.lower() or 'EXECUTION_ERROR' in output:
                return None
            # Extract the result from the output
            output_lines = output.strip().split('\n')
            if output_lines:
                return output_lines[-1].strip()
            return None
        except Exception as e:
            self.logger.log_error(f"Function execution failed: {e}")
            return None

    # Synthetic triple-generation methods were removed: only the single genuine
    # example is used, in line with the anti-cheating design.

    def _validate_triple(self, triple: Dict[str, Any]) -> bool:
        """Validate an IPO triple."""
        if not self.config.validate_triples:
            return True
        try:
            # 1. Check that the required fields exist
            required_fields = ['input', 'program', 'expected_output', 'function_name']
            if not all(field in triple for field in required_fields):
                return False
            # 2. Syntax-check the code
            try:
                ast.parse(triple['program'])
            except SyntaxError:
                return False
            # 3. Re-execute for consistency (AZR style);
            # triple['input'] now holds only the actual arguments
            actual_output = self._execute_function(
                triple['program'],
                triple['function_name'],
                triple['input']
            )
            if actual_output is None:
                return False
            # 4. Check that the outputs match
            return str(actual_output) == str(triple['expected_output'])
        except Exception as e:
            self.logger.log_error(f"Triple validation failed: {e}")
            return False

    def get_triple_statistics(self) -> Dict[str, Any]:
        """Statistics over the extracted triples."""
        if not self.extracted_triples:
            return {"total": 0, "valid": 0, "invalid": 0}
        valid_count = sum(1 for triple in self.extracted_triples if triple.get('is_correct', False))
        return {
            "total": len(self.extracted_triples),
            "valid": valid_count,
            "invalid": len(self.extracted_triples) - valid_count,
            "extraction_methods": {
                "test_case": sum(1 for t in self.extracted_triples if t.get('extraction_method') == 'test_case'),
                "synthetic": sum(1 for t in self.extracted_triples if t.get('extraction_method') == 'synthetic')
            }
        }

    def generate_diverse_inputs(self, problem: Dict[str, Any], solution: str,
                                existing_examples: List[Tuple[str, str]]) -> List[Dict[str, Any]]:
        """Generate diverse inputs with an LLM."""
        problem_id = problem.get('task_id', 'unknown')
        self.logger.log_info(f"🎲 Generating diverse inputs for {problem_id}")
        try:
            # 1. Extract function info
            entry_point = problem.get('entry_point', 'unknown')
            func_info = self._extract_function_info(solution, entry_point)
            if not func_info:
                self.logger.log_error("Failed to extract function info for input generation")
                return []
            # 2. Infer argument types
            arg_type_info = self._infer_argument_types(func_info, existing_examples, solution)
            # 3. Build the prompt
            prompt = self._create_input_generation_prompt(
                problem_description=problem.get('prompt', ''),
                existing_examples=existing_examples,
                full_code=solution,
                arg_type_info=arg_type_info
            )
            # 4. Generate inputs with the LLM
            generated_inputs = self._call_llm_for_inputs(prompt, existing_examples, func_info, arg_type_info)
            # 5. Validate the generated inputs
            valid_inputs = self._validate_generated_inputs(generated_inputs, func_info, solution)
            self.logger.log_info(f"✅ Generated {len(valid_inputs)} valid diverse inputs")
            return valid_inputs
        except Exception as e:
            self.logger.log_error(f"Failed to generate diverse inputs: {e}")
            return []

    def generate_diverse_inputs_batch(self, program_input_pairs: List[Dict[str, Any]]) -> Tuple[List[List[Dict[str, Any]]], List[Optional[Dict[str, Any]]]]:
        """Generate diverse inputs for multiple programs in one batch."""
        if not self.solution_generator:
            self.logger.log_error("Solution generator not set for batch processing")
            return [], []
        self.logger.log_info(f"🎲 Generating diverse inputs for {len(program_input_pairs)} programs (BATCH)")
        try:
            # Build an input-generation prompt for every program
            batch_prompts = []
            program_contexts = []
            for pair in program_input_pairs:
                problem = pair['problem']
                solution = pair['solution']
                existing_examples = pair['existing_examples']
                # Extract function info
                entry_point = problem.get('entry_point', 'unknown')
                func_info = self._extract_function_info(solution, entry_point)
                if not func_info:
                    program_contexts.append(None)
                    batch_prompts.append("")
                    continue
                # Infer argument types
                arg_type_info = self._infer_argument_types(func_info, existing_examples, solution)
                # Build the prompt
                prompt = self._create_input_generation_prompt(
                    problem_description=problem.get('prompt', ''),
                    existing_examples=existing_examples,
                    full_code=solution,
                    arg_type_info=arg_type_info
                )
                batch_prompts.append(prompt)
                program_contexts.append({
                    'func_info': func_info,
                    'solution': solution,
                    'problem': problem
                })
            # Call the LLM through the VLLM batch path
            if not batch_prompts or all(not p for p in batch_prompts):
                return [], []
            self.logger.log_info(f"🔍 Sending {len(batch_prompts)} prompts to VLLM for input generation")
            self.logger.log_info(f"🔍 First prompt preview: {batch_prompts[0][:200]}..." if batch_prompts else "No prompts")
            # Input generation is not code generation, so the raw responses are
            # used: generate_batch's post-processing (function extraction, etc.)
            # does not fit input generation
            batch_responses = self.solution_generator._generate_batch_with_vllm(
                batch_prompts,
                temperature=0.7  # input generation needs some randomness
            )
            self.logger.log_info(f"🔍 Received {len(batch_responses)} responses from VLLM")
            for i, response in enumerate(batch_responses[:2]):  # log only the first two
                self.logger.log_info(f"🔍 Response {i} preview: {response[:200]}...")
            # Parse each response into inputs
            batch_results = []
            batch_generation_info = []  # per-program input-generation info
            for i, (response, context) in enumerate(zip(batch_responses, program_contexts)):
                if context is None:
                    batch_results.append([])
                    batch_generation_info.append(None)
                    continue
                try:
                    # Extract inputs from the response
                    generated_inputs = self._parse_llm_input_response(
                        response,
                        context['func_info'],
                        context['problem'].get('task_id', 'unknown')
                    )
                    # Debug: log how many inputs were parsed
                    self.logger.log_info(f"🔍 Parsed {len(generated_inputs)} inputs from response {i}")
                    if generated_inputs:
                        self.logger.log_info(f"🔍 First parsed input: {generated_inputs[0]}")
                    # Validate the generated inputs
                    valid_inputs = self._validate_generated_inputs(
                        generated_inputs,
                        context['func_info'],
                        context['solution']
                    )
                    # Debug: log how many inputs passed validation
                    self.logger.log_info(f"🔍 {len(valid_inputs)} inputs passed validation from response {i}")
                    batch_results.append(valid_inputs)
                    # Record the input-generation info
                    generation_info = {
                        'prompt': batch_prompts[i] if i < len(batch_prompts) else '',
                        'llm_response': response,
                        'extracted_inputs': generated_inputs,
                        'valid_inputs': valid_inputs,
                        'existing_examples': program_input_pairs[i]['existing_examples'] if i < len(program_input_pairs) else [],
                        'function_info': context['func_info'],
                        'arg_type_info': self._infer_argument_types(
                            context['func_info'],
                            program_input_pairs[i]['existing_examples'] if i < len(program_input_pairs) else [],
                            context['solution']
                        )
                    }
                    batch_generation_info.append(generation_info)
                except Exception as e:
                    self.logger.log_error(f"Failed to process batch item {i}: {e}")
                    # Extra debug info
                    self.logger.log_error(f"Response preview: {response[:200]}...")
                    import traceback
                    self.logger.log_error(f"Traceback: {traceback.format_exc()}")
                    batch_results.append([])
                    # Record the error as well
                    batch_generation_info.append({
                        'error': str(e),
                        'prompt': batch_prompts[i] if i < len(batch_prompts) else '',
                        'llm_response': response,
                        'traceback': traceback.format_exc()
                    })
            total_generated = sum(len(inputs) for inputs in batch_results)
            self.logger.log_info(f"✅ Generated {total_generated} diverse inputs across {len(program_input_pairs)} programs")
            # Return both inputs and generation info as a tuple
            return batch_results, batch_generation_info
        except Exception as e:
            self.logger.log_error(f"Batch input generation failed: {e}")
            return [], []

    def _parse_llm_input_response(self, llm_response: str, func_info: Dict[str, Any], problem_id: str) -> List[Dict[str, Any]]:
        """Parse input examples out of an LLM response."""
        self.logger.log_info(f"🔍 Parsing LLM response for {problem_id}, response length: {len(llm_response)}")
        try:
            # Extract code from ```python ... ``` blocks
            code_pattern = r'```python\n(.*?)\n```'
            matches = re.findall(code_pattern, llm_response, re.DOTALL)
            if not matches:
                self.logger.log_info("🔍 No code block found, searching for examples = [")
                # No code block: look for "examples = [" in the whole response
                if 'examples = [' in llm_response:
                    start = llm_response.find('examples = [')
                    # Find the balanced closing bracket
                    bracket_count = 0
                    end = start
                    for i, char in enumerate(llm_response[start:]):
                        if char == '[':
                            bracket_count += 1
                        elif char == ']':
                            bracket_count -= 1
                            if bracket_count == 0:
                                end = start + i + 1
                                break
                    if end > start:
                        code = llm_response[start:end]
                        self.logger.log_info(f"🔍 Found examples code: {code[:100]}...")
                        # NOTE: exec() runs LLM-produced code in this process
                        exec_globals = {}
                        exec(code, exec_globals)
                        examples = exec_globals.get('examples', [])
                        self.logger.log_info(f"🔍 Extracted {len(examples)} examples")
                        return examples
                else:
                    self.logger.log_info("🔍 No 'examples = [' found in response")
            else:
                # Extract the examples from the first code block
                self.logger.log_info(f"🔍 Found {len(matches)} code blocks")
                code = matches[0]
                self.logger.log_info(f"🔍 Code block preview: {code[:100]}...")
                exec_globals = {}
                exec(code, exec_globals)
                examples = exec_globals.get('examples', [])
                self.logger.log_info(f"🔍 Extracted {len(examples)} examples from code block")
                # Handle the case where the examples are not dicts
                if examples and len(examples) > 0:
                    self.logger.log_info(f"🔍 First example type: {type(examples[0])}")
                    if isinstance(examples[0], dict):
                        # Drop keys such as expected_output and description
                        cleaned_examples = []
                        for ex in examples:
                            cleaned = {k: v for k, v in ex.items()
                                       if k not in ['expected_output', 'description']}
                            if cleaned:  # keep only non-empty dicts
                                cleaned_examples.append(cleaned)
                        self.logger.log_info(f"🔍 Cleaned {len(cleaned_examples)} examples")
                        return cleaned_examples
                return examples
            return []
        except Exception as e:
            self.logger.log_error(f"Failed to parse generated examples for {problem_id}: {e}")
            import traceback
            self.logger.log_error(f"Traceback: {traceback.format_exc()}")
            return []
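
    # A response this parser accepts looks like the following (a hypothetical
    # LLM output matching the format requested by the prompt built below):
    #
    #     ```python
    #     examples = [
    #         {"string": ""},     # empty edge case
    #         {"string": "abc"},  # typical case
    #     ]
    #     ```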

    def _infer_argument_types(self, func_info: Dict[str, str],
                              examples: List[Tuple[str, str]],
                              solution: str) -> Dict[str, str]:
        """Infer argument types from existing examples and AST analysis."""
        arg_types = {}
        func_name = func_info['name']
        arg_names = func_info['args']
        # 1. Extract type annotations from the AST
        try:
            tree = ast.parse(solution)
            for node in ast.walk(tree):
                if isinstance(node, ast.FunctionDef) and node.name == func_name:
                    for i, arg in enumerate(node.args.args):
                        if i < len(arg_names) and arg.annotation:
                            # Type annotation present
                            arg_types[arg_names[i]] = ast.unparse(arg.annotation)
        except Exception:
            pass
        # 2. Infer types from the existing examples
        if examples:
            for input_str, _ in examples:
                # Extract args from "func_name(args)"
                if input_str.startswith(func_name + '(') and input_str.endswith(')'):
                    args_str = input_str[len(func_name)+1:-1]
                    try:
                        # Parse the arguments (eval handles multi-argument strings)
                        parsed_args = eval(f"({args_str},)")
                        if not isinstance(parsed_args, tuple):
                            parsed_args = (parsed_args,)
                        # Infer each argument's type
                        for i, arg_value in enumerate(parsed_args):
                            if i < len(arg_names):
                                arg_name = arg_names[i]
                                arg_type = type(arg_value).__name__
                                # Special-case homogeneous lists
                                if isinstance(arg_value, list):
                                    if arg_value and all(isinstance(x, type(arg_value[0])) for x in arg_value):
                                        inner_type = type(arg_value[0]).__name__
                                        arg_type = f"List[{inner_type}]"
                                    else:
                                        arg_type = "List"
                                # Keep any type found earlier (annotations win)
                                if arg_name not in arg_types:
                                    arg_types[arg_name] = arg_type
                    except Exception:
                        pass
        # 3. Return the type map, marking anything still unknown
        for arg_name in arg_names:
            if arg_name not in arg_types:
                arg_types[arg_name] = "Any (type unknown)"
        return arg_types
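
    # Example (illustrative): for "def pad(text: str, width): ..." with the
    # docstring example "pad('hi', 4)", the inferred map would be
    #
    #     {'text': 'str', 'width': 'int'}
    #
    # where 'str' comes from the annotation and 'int' from the example value.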

    def _create_input_generation_prompt(self, problem_description: str,
                                        existing_examples: List[Tuple[str, str]],
                                        full_code: str,
                                        arg_type_info: Dict[str, str]) -> str:
        """Build the prompt used for input generation."""
        # Format every existing example
        examples_text = ""
        for i, (input_str, output_str) in enumerate(existing_examples):
            examples_text += f"Example {i+1}:\n"
            examples_text += f"Input: {input_str}\n"
            examples_text += f"Output: {output_str}\n\n"
        # Format arg_type_info as text
        arg_type_text = "Argument types:\n"
        for arg, arg_type in arg_type_info.items():
            arg_type_text += f"- {arg}: {arg_type}\n"
        prompt = f"""Given the following problem description and its Python function implementation, first analyze the types and valid ranges of the function arguments, then write **5 different example inputs** for the function that cover a diverse mix of typical (general) cases and edge/boundary cases.

Problem Description:
'''
{problem_description}
'''

Existing Examples from Problem:
{examples_text}

Function Implementation:
```python
{full_code}
```

{arg_type_text}

Based on the existing examples above, generate 5 NEW diverse test inputs that are different from the existing ones. Each input should be a Python dict where:
- Keys are the exact parameter names from the function signature
- Values are appropriate test values for each parameter

Format your response as:
```python
examples = [
    {{dict_with_all_function_parameters}},  # Description of this test case
    {{dict_with_all_function_parameters}},  # Description of this test case
    ...  # Continue for all 5 examples
]
```

Ensure your examples include:
- At least 2 typical/general cases
- At least 2 edge/boundary cases
- 1 special case (empty, zero, maximum values, etc.)
- All examples should be DIFFERENT from the existing examples shown above"""
        return prompt

    def _call_llm_for_inputs(self, prompt: str, existing_examples: List[Tuple[str, str]],
                             func_info: Dict[str, Any], arg_type_info: str) -> List[Dict[str, Any]]:
        """Call the LLM to generate inputs, then parse them."""
        # Save the prompt
        self.last_generation_prompt = prompt
        try:
            # Dedicated LLM call for input generation (temperature=0.5)
            if self.model is not None and self.tokenizer is not None:
                # Prefer VLLM when the model is a vllm.LLM instance
                try:
                    from vllm import LLM
                    if isinstance(self.model, LLM):
                        response = self._generate_with_vllm_for_inputs(prompt)
                    else:
                        response = self._generate_with_hf_for_inputs(prompt)
                except ImportError:
                    response = self._generate_with_hf_for_inputs(prompt)
                # Save the response
                self.last_generation_response = response
                # Extract the examples from the response
                parsed_inputs = self._parse_generated_examples(response)
                # Record the input-generation info
                self.last_input_generation_info = {
                    'prompt': prompt,
                    'llm_response': response,
                    'extracted_inputs': parsed_inputs,
                    'existing_examples': existing_examples,
                    'function_info': func_info,
                    'arg_type_info': arg_type_info
                }
                return parsed_inputs
            else:
                # No model available: return an empty list (test environment)
                self.logger.log_warning("No model available for input generation")
                self.last_generation_response = "No model available"
                # Record the info even on failure
                self.last_input_generation_info = {
                    'prompt': prompt,
                    'llm_response': "No model available",
                    'extracted_inputs': [],
                    'existing_examples': existing_examples,
                    'function_info': func_info,
                    'arg_type_info': arg_type_info,
                    'error': "No model available"
                }
                return []
        except Exception as e:
            self.logger.log_error(f"Failed to call LLM for inputs: {e}")
            self.last_generation_response = f"Error: {str(e)}"
            # Record the info on errors as well (the parameters are always
            # bound, so locals().get() is unnecessary here)
            self.last_input_generation_info = {
                'prompt': prompt,
                'llm_response': f"Error: {str(e)}",
                'extracted_inputs': [],
                'existing_examples': existing_examples,
                'function_info': func_info,
                'arg_type_info': arg_type_info,
                'error': str(e)
            }
            return []

    def _generate_with_vllm_for_inputs(self, prompt: str) -> str:
        """VLLM backend for input generation (temperature=0.5 for diversity)."""
        try:
            from vllm import SamplingParams
            # Sampling settings tuned for diverse input generation
            sampling_params = SamplingParams(
                temperature=0.5,  # higher temperature for diverse inputs
                max_tokens=2048,
                top_p=0.95,  # top_p sampling for diversity
                stop=["\n```\n"],  # stop when the code block closes
            )
            outputs = self.model.generate([prompt], sampling_params, use_tqdm=False)
            return outputs[0].outputs[0].text.replace("\t", " ").strip()
        except Exception as e:
            self.logger.log_error(f"VLLM input generation failed: {e}")
            return ""

    def _generate_with_hf_for_inputs(self, prompt: str) -> str:
        """HuggingFace backend for input generation (temperature=0.5 for diversity)."""
        try:
            import torch
            # Tokenize the prompt
            inputs = self.tokenizer(prompt, return_tensors='pt', truncation=True, max_length=4096)
            # Set the attention mask explicitly
            if 'attention_mask' not in inputs:
                inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])
            # Move tensors to the model's device
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
            with torch.no_grad():
                # Free cached GPU memory first
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                # Sampling settings for input generation
                outputs = self.model.generate(
                    inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    max_new_tokens=2048,
                    do_sample=True,  # enable sampling
                    temperature=0.5,  # temperature for diverse inputs
                    top_p=0.95,  # top_p sampling for diversity
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )
            # Extract the response (strip the echoed prompt)
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            response = response[len(prompt):].strip()
            return response
        except Exception as e:
            self.logger.log_error(f"HuggingFace input generation failed: {e}")
            return ""

    def _parse_generated_examples(self, llm_response: str) -> List[Dict[str, Any]]:
        """Parse examples out of an LLM response."""
        try:
            # Extract code from ```python ... ``` blocks
            code_pattern = r'```python\n(.*?)\n```'
            matches = re.findall(code_pattern, llm_response, re.DOTALL)
            if not matches:
                # No code block: look for "examples = [" in the whole response
                if 'examples = [' in llm_response:
                    start = llm_response.find('examples = [')
                    # Find the balanced closing bracket
                    bracket_count = 0
                    end = start
                    for i, char in enumerate(llm_response[start:]):
                        if char == '[':
                            bracket_count += 1
                        elif char == ']':
                            bracket_count -= 1
                            if bracket_count == 0:
                                end = start + i + 1
                                break
                    if end > start:
                        code = llm_response[start:end]
                        exec_globals = {}
                        exec(code, exec_globals)
                        return exec_globals.get('examples', [])
            else:
                # Extract the examples from the first code block
                code = matches[0]
                exec_globals = {}
                exec(code, exec_globals)
                return exec_globals.get('examples', [])
            return []
        except Exception as e:
            self.logger.log_error(f"Failed to parse generated examples: {e}")
            return []

    def _validate_generated_inputs(self, generated_inputs: List[Dict[str, Any]],
                                   func_info: Dict[str, str],
                                   solution: str) -> List[Dict[str, Any]]:
        """Validate the generated inputs."""
        valid_inputs = []
        func_name = func_info['name']
        for i, input_dict in enumerate(generated_inputs):
            try:
                # 1. Check that all required arguments are present
                required_args = set(func_info['args'])
                provided_args = set(input_dict.keys())
                if not required_args.issubset(provided_args):
                    self.logger.log_warning(f"Input {i+1} missing required args: {required_args - provided_args}")
                    continue
                # 2. Validate by actually executing the solution
                # Order the arguments to match the signature
                args = [input_dict[arg] for arg in func_info['args'] if arg in input_dict]
                # Execution test
                output = self._execute_llm_solution(solution, func_name, args)
                if output is not None:
                    valid_inputs.append(input_dict)
                    self.logger.log_info(f"✅ Valid input {i+1}: {input_dict}")
                else:
                    self.logger.log_warning(f"❌ Input {i+1} execution failed")
            except Exception as e:
                self.logger.log_error(f"Input {i+1} validation error: {e}")
        return valid_inputs

    def create_ipo_from_input(self, problem: Dict[str, Any],
                              solution: str,
                              input_dict: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Create an IPO triple from a newly generated input."""
        try:
            problem_id = problem.get('task_id', 'unknown')
            entry_point = problem.get('entry_point', 'unknown')
            # Extract function info
            func_info = self._extract_function_info(solution, entry_point)
            if not func_info:
                return None
            # Order the arguments to match the signature
            args = [input_dict[arg] for arg in func_info['args'] if arg in input_dict]
            # Execute to obtain the output
            output = self._execute_llm_solution(solution, func_info['name'], args)
            if output is None:
                return None
            # Build the input strings
            args_str = ', '.join(repr(arg) for arg in args)
            full_input_str = f"{func_info['name']}({args_str})"
            # Create the IPO triple
            triple_id = f"{problem_id}_generated_{len(self.extracted_triples)}"
            triple = {
                'id': triple_id,
                'input': args_str,  # actual arguments only
                'full_input_str': full_input_str,  # the full function call
                'program': solution,
                'expected_output': output,
                'actual_output': output,
                'function_name': func_info['name'],
                'function_args': func_info['args'],
                'is_correct': True,  # generated triples match by construction
                'extraction_method': 'generated'
            }
            return triple
        except Exception as e:
            self.logger.log_error(f"Failed to create IPO from input: {e}")
            return None

    def cleanup(self):
        """Release executor resources."""
        if hasattr(self.executor, 'cleanup'):
            self.executor.cleanup()
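

# A minimal end-to-end sketch (illustrative only; it assumes TestTimeConfig can
# be constructed with defaults and uses a hand-written HumanEval-style problem
# dict -- adjust both to your actual setup).  Kept as a comment because this
# module uses relative imports and is not runnable as a standalone script:
#
#     config = TestTimeConfig()  # hypothetical default construction
#     extractor = IPOTripleExtractor(config)
#     problem = {
#         'task_id': 'HumanEval/23',
#         'entry_point': 'strlen',
#         'prompt': ('def strlen(string: str) -> int:\n'
#                    '    """ >>> strlen(\'abc\')\n    3\n    """\n'),
#     }
#     solution = "def strlen(string):\n    return len(string)"
#     triples = extractor.extract_triples(problem, solution)
#     buffer = IPOBuffer()
#     for t in triples:
#         buffer.add(problem['task_id'], t)
#     extractor.cleanup()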