"""
KernelX Intelligence Layer — RL Environment (OpenEnv structure)
Provides reset/step interface for training the Strategist policy via GRPO.
Replays recorded transitions from the preprocessed JSONL dataset and
computes multi-objective rewards.
"""
import json
import random
from dataclasses import dataclass
from typing import List, Tuple

from .rewards import RewardComputer


@dataclass
class KernelState:
"""Observation wrapper for the RL environment."""
features: List[float] # active features (10D after preprocessing)
pid: int
cpu: int
timestep: int
    prev_action: float


@dataclass
class KernelAction:
"""Action output from the Strategist."""
    value: float  # scheduling weight in [-1.0, 1.0]


class KernelSchedulerEnv:
    """Offline RL environment that replays recorded kernel transitions.

    Each episode starts at a random position in the dataset and runs for
    max_steps transitions. The reward is computed by the multi-objective
    RewardComputer.
    """

    def __init__(
self,
data_path: str = "training/data/train.jsonl",
max_steps: int = 10,
alpha: float = 1.0,
beta: float = 2.0,
gamma: float = 0.5,
):
        with open(data_path) as f:
            self.records = [json.loads(line) for line in f if line.strip()]
self.max_steps = max_steps
self.reward_computer = RewardComputer(alpha=alpha, beta=beta, gamma=gamma)
# Episode state
self.timestep = 0
self.current_idx = 0
self.prev_action = 0.0
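        # Each episode reads max_steps transitions plus the observation that
        # follows them, so the dataset needs at least max_steps + 1 records.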
if len(self.records) < max_steps + 1:
raise ValueError(
f"Dataset has {len(self.records)} records but max_steps={max_steps} "
f"requires at least {max_steps + 1}"
            )

    def reset(self) -> KernelState:
"""Start a fresh episode from a random point in the dataset."""
self.timestep = 0
self.current_idx = random.randint(0, len(self.records) - self.max_steps - 1)
self.prev_action = 0.0
        return self._get_state()

    def step(self, action: KernelAction) -> Tuple[KernelState, dict, bool]:
        """Apply action, compute reward, advance to next state.

        Returns:
            next_state: The new KernelState after the transition
            reward_breakdown: Dict with 'total' and per-component rewards
            done: Whether the episode has ended
        """
current = self.records[self.current_idx + self.timestep]
next_idx = self.current_idx + self.timestep + 1
next_rec = self.records[next_idx] if next_idx < len(self.records) else current
reward_breakdown = self.reward_computer.compute_total(
state=current["state"],
action=action,
prev_action=self.prev_action,
next_state=next_rec["state"],
)
self.timestep += 1
self.prev_action = action.value
done = self.timestep >= self.max_steps
        return self._get_state(), reward_breakdown, done

    def _get_state(self) -> KernelState:
"""Read the current state from the dataset."""
rec = self.records[self.current_idx + self.timestep]
return KernelState(
features=rec["state"],
pid=rec["pid"],
cpu=rec["cpu"],
timestep=self.timestep,
prev_action=self.prev_action,
        )

    def simulate(self, state_features: list, action_value: float) -> list:
        """Lightweight next-state lookup for reward_fn during GRPO.

        Finds the nearest recorded state in the dataset and returns its
        recorded next_state. This is a simple approximation; the World Model
        provides higher-fidelity simulation.
        """
import numpy as np
state_arr = np.array(state_features)
best_dist = float("inf")
best_next = state_features # fallback
# Sample a subset to keep this fast
sample_size = min(500, len(self.records))
indices = random.sample(range(len(self.records)), sample_size)
for idx in indices:
rec = self.records[idx]
dist = float(np.linalg.norm(state_arr - np.array(rec["state"])))
if dist < best_dist:
best_dist = dist
best_next = rec["next_state"]
return best_next
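

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the training pipeline):
# roll out one episode with random actions and query simulate() the way a
# GRPO reward_fn might. Assumes the preprocessed JSONL dataset exists at the
# default path and that the reward breakdown contains a "total" key, as
# described in step()'s docstring. Run as a module (e.g. `python -m ...`) so
# the relative import of RewardComputer resolves.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    env = KernelSchedulerEnv()
    state = env.reset()
    done = False
    episode_return = 0.0
    while not done:
        # A trained Strategist policy would produce this value; a random
        # scheduling weight in [-1.0, 1.0] stands in for it here.
        action = KernelAction(value=random.uniform(-1.0, 1.0))
        state, reward_breakdown, done = env.step(action)
        episode_return += reward_breakdown["total"]
    print(f"episode return over {env.max_steps} steps: {episode_return:.4f}")

    # Nearest-neighbour next-state lookup of the kind reward_fn uses in GRPO.
    approx_next = env.simulate(state.features, action_value=0.0)
    print(f"approximate next state has {len(approx_next)} features")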