import numpy as np
from typing import List, Tuple, Any


def get_context_bucket(obs: Any) -> Tuple[int, int, int]:
    """
    Discretizes the observation into a context bucket for preference learning.

    Args:
        obs: SmartpayenvObservation object or dict

    Returns:
        tuple: (bin_category, amount_bucket, risk_bucket)
    """
    # Extract values whether obs is an attribute-style object or a dict
    if hasattr(obs, 'bin_category'):
        bin_cat = int(obs.bin_category)
        amount = float(obs.amount)
        risk = float(obs.observed_fraud_risk)
    else:
        bin_cat = int(obs.get('bin_category', 0))
        amount = float(obs.get('amount', 0))
        risk = float(obs.get('observed_fraud_risk', 0))
    return (
        bin_cat,
        int(amount // 500),            # Bucket amounts by $500
        int(np.clip(risk * 5, 0, 4)),  # Risk buckets 0–4
    )
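
# Usage sketch for get_context_bucket (illustrative values only; the dict keys
# mirror the SmartpayenvObservation fields accessed above):
#
# >>> get_context_bucket({'bin_category': 2, 'amount': 1250.0, 'observed_fraud_risk': 0.37})
# (2, 2, 1)    # $1250 lands in the third $500 bucket; risk 0.37 * 5 = 1.85 -> bucket 1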


def calculate_advantages(results: List[Tuple[Any, float]], baseline: float = 0.5) -> List[Tuple[Any, float]]:
    """
    Calculates standardized advantage scores from simulation results.

    Args:
        results: List of (action, reward) tuples
        baseline: Neutral reward baseline

    Returns:
        List of (action, advantage) tuples
    """
    if not results:
        return []
    scores = [r for _, r in results]
    if len(scores) < 2:
        # If only one action, advantage is relative to the baseline
        return [(results[0][0], results[0][1] - baseline)]
    mean = np.mean(scores)
    std = np.std(scores) + 1e-6  # Avoid division by zero
    return [(a, (r - mean) / std) for (a, r) in results]
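
# Usage sketch for calculate_advantages (hypothetical action labels and rewards):
#
# >>> calculate_advantages([('a1', 0.9), ('a2', 0.1), ('a3', 0.5)])
# approx. [('a1', 1.22), ('a2', -1.22), ('a3', 0.0)]    # mean 0.5, std ~= 0.327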


def rank_actions(results: List[Tuple[Any, float]]) -> List[Tuple[Any, int]]:
    """
    Ranks actions by reward in ascending order (higher rank index = better).

    Args:
        results: List of (action, reward) tuples

    Returns:
        List of (action, rank) tuples, where rank 0 is the lowest-reward action
    """
    sorted_results = sorted(results, key=lambda x: x[1])
    return [(a, i) for i, (a, _) in enumerate(sorted_results)]
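
# Usage sketch for rank_actions (same hypothetical results as above):
#
# >>> rank_actions([('a1', 0.9), ('a2', 0.1), ('a3', 0.5)])
# [('a2', 0), ('a3', 1), ('a1', 2)]    # highest reward receives the top rank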