import numpy as np
from typing import List, Tuple, Any


def get_context_bucket(obs: Any) -> Tuple[int, int, int]:
    """
    Discretizes the observation into a context bucket for preference learning.

    Args:
        obs: SmartpayenvObservation object or dict

    Returns:
        tuple: (bin_category, amount_bucket, risk_bucket)
    """
    # Extract values whether obs is an attribute-style object or a dict
    if hasattr(obs, 'bin_category'):
        bin_cat = int(obs.bin_category)
        amount = float(obs.amount)
        risk = float(obs.observed_fraud_risk)
    else:
        bin_cat = int(obs.get('bin_category', 0))
        amount = float(obs.get('amount', 0))
        risk = float(obs.get('observed_fraud_risk', 0))
    return (
        bin_cat,
        int(amount // 500),            # Bucket amounts by $500
        int(np.clip(risk * 5, 0, 4)),  # Risk buckets 0–4
    )
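
# Usage sketch for get_context_bucket (illustrative values only; the dict keys
# mirror the SmartpayenvObservation fields accessed above):
#
# >>> get_context_bucket({'bin_category': 2, 'amount': 1250.0, 'observed_fraud_risk': 0.37})
# (2, 2, 1)    # $1250 lands in the third $500 bucket; risk 0.37 * 5 = 1.85 -> bucket 1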


def calculate_advantages(results: List[Tuple[Any, float]], baseline: float = 0.5) -> List[Tuple[Any, float]]:
    """
    Calculates standardized advantage scores from simulation results.

    Args:
        results: List of (action, reward) tuples
        baseline: Neutral reward baseline

    Returns:
        List of (action, advantage) tuples
    """
    if not results:
        return []
    scores = [r for _, r in results]
    if len(scores) < 2:
        # If only one action, advantage is relative to the baseline
        return [(results[0][0], results[0][1] - baseline)]
    mean = np.mean(scores)
    std = np.std(scores) + 1e-6  # Avoid division by zero
    return [(a, (r - mean) / std) for (a, r) in results]
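
# Usage sketch for calculate_advantages (hypothetical action labels and rewards):
#
# >>> calculate_advantages([('a1', 0.9), ('a2', 0.1), ('a3', 0.5)])
# approx. [('a1', 1.22), ('a2', -1.22), ('a3', 0.0)]    # mean 0.5, std ~= 0.327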


def rank_actions(results: List[Tuple[Any, float]]) -> List[Tuple[Any, int]]:
    """
    Ranks actions by reward in ascending order (higher rank index = better).

    Args:
        results: List of (action, reward) tuples

    Returns:
        List of (action, rank) tuples, where rank 0 is the lowest-reward action
    """
    sorted_results = sorted(results, key=lambda x: x[1])
    return [(a, i) for i, (a, _) in enumerate(sorted_results)]
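
# Usage sketch for rank_actions (same hypothetical results as above):
#
# >>> rank_actions([('a1', 0.9), ('a2', 0.1), ('a3', 0.5)])
# [('a2', 0), ('a3', 1), ('a1', 2)]    # highest reward receives the top rank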