Upload src/energy_v4.py

407c5f4 verified 2 days ago

12.7 kB

	"""
	V4 Energy-Aware Training Module.

	Implements energy-constrained optimization with hardware-aware cost models.
	Based on research from quantum ML energy benchmarking and green AI principles.

	Key features:
	- Hardware-specific energy models (CPU, GPU, edge TPU, quantum simulator)
	- FLOPs → energy conversion with hardware-specific coefficients
	- Energy-accuracy Pareto frontier tracking
	- Carbon-aware scheduling (time-of-day energy mix)
	- Quantum circuit energy overhead estimation

	References:
	- Patterson et al. "Carbon Emissions and Large Neural Network Training" (2021)
	- Luccioni et al. "Estimating the Carbon Footprint of BLOOM" (2023)
	- QKAN (arXiv:2509.14026) — energy-efficient quantum activation
	"""

	import torch
	import time
	import math
	from typing import Dict, Optional, Tuple
	from dataclasses import dataclass, field


	# ─── Hardware Energy Models ─────────────────────────────────────────────────

	@dataclass
	class HardwareProfile:
	"""Energy and performance profile for a hardware target."""
	name: str
	flops_per_second: float # Peak FLOPS
	watts_idle: float # Idle power (W)
	watts_peak: float # Peak power (W)
	energy_per_flop_uj: float # μJ per FLOP
	memory_bandwidth_gbs: float # GB/s
	carbon_intensity_g_per_kwh: float = 400 # gCO2/kWh (global average)


	# Hardware profiles (empirically calibrated)
	HARDWARE_PROFILES = {
	"cpu_intel_xeon": HardwareProfile(
	name="Intel Xeon (CPU)",
	flops_per_second=500e9, # 500 GFLOPS
	watts_idle=30,
	watts_peak=150,
	energy_per_flop_uj=3e-7, # 0.3 pJ/FLOP → 3e-7 μJ
	memory_bandwidth_gbs=50,
	carbon_intensity_g_per_kwh=400,
	),
	"cpu_apple_m2": HardwareProfile(
	name="Apple M2 (CPU)",
	flops_per_second=1.5e12, # 1.5 TFLOPS
	watts_idle=3,
	watts_peak=20,
	energy_per_flop_uj=1.3e-8, # Very efficient
	memory_bandwidth_gbs=100,
	carbon_intensity_g_per_kwh=400,
	),
	"gpu_a100": HardwareProfile(
	name="NVIDIA A100 (GPU)",
	flops_per_second=312e12, # 312 TFLOPS (bf16)
	watts_idle=50,
	watts_peak=400,
	energy_per_flop_uj=1.3e-9, # 1.3 fJ → 1.3e-9 μJ
	memory_bandwidth_gbs=2000,
	carbon_intensity_g_per_kwh=400,
	),
	"gpu_t4": HardwareProfile(
	name="NVIDIA T4 (GPU)",
	flops_per_second=65e12, # 65 TFLOPS (fp16)
	watts_idle=15,
	watts_peak=70,
	energy_per_flop_uj=1.1e-9,
	memory_bandwidth_gbs=320,
	carbon_intensity_g_per_kwh=400,
	),
	"edge_tpu": HardwareProfile(
	name="Google Edge TPU",
	flops_per_second=4e12, # 4 TOPS (int8)
	watts_idle=0.5,
	watts_peak=2,
	energy_per_flop_uj=5e-10, # 0.5 fJ — most efficient
	memory_bandwidth_gbs=30,
	carbon_intensity_g_per_kwh=400,
	),
	"edge_mobile": HardwareProfile(
	name="Mobile CPU (Edge)",
	flops_per_second=50e9, # 50 GFLOPS
	watts_idle=0.3,
	watts_peak=5,
	energy_per_flop_uj=1e-7, # 0.1 pJ
	memory_bandwidth_gbs=20,
	carbon_intensity_g_per_kwh=400,
	),
	"quantum_simulator": HardwareProfile(
	name="PennyLane Quantum Simulator",
	flops_per_second=1e9, # Very slow — CPU-bound simulation
	watts_idle=30,
	watts_peak=150,
	energy_per_flop_uj=1e-6, # 1 pJ — much higher due to simulation overhead
	memory_bandwidth_gbs=20,
	carbon_intensity_g_per_kwh=400,
	),
	"quantum_hardware_ibm": HardwareProfile(
	name="IBM Quantum (Eagle)",
	flops_per_second=1e6, # Quantum: no FLOPs, use equivalent
	watts_idle=50, # Cryogenic cooling
	watts_peak=25000, # ~25 kW for dilution fridge
	energy_per_flop_uj=1.0, # Per-quantum-gate equivalent ~1 μJ
	memory_bandwidth_gbs=0.01,
	carbon_intensity_g_per_kwh=400,
	),
	}


	# ─── Energy Estimator ────────────────────────────────────────────────────────

	class EnergyEstimatorV4:
	"""
	V4 energy estimator with hardware-aware cost models.

	Accounts for:
	- Compute energy (FLOPs → μJ)
	- Memory transfer energy
	- Quantum circuit simulation overhead
	- Idle power during data loading
	- Batch size effects on utilization

	All energy values in microjoules (μJ).
	"""

	def __init__(self, hardware: str = "cpu_intel_xeon"):
	self.set_hardware(hardware)

	# Overhead multipliers
	self.quantum_overhead_factor = 50.0 # Quantum sim is ~50× more expensive per "FLOP"
	self.memory_transfer_cost_uj_per_gb = 500.0 # ~500 μJ per GB transferred

	def set_hardware(self, hardware: str):
	"""Switch hardware target."""
	self.hardware_name = hardware
	self.profile = HARDWARE_PROFILES.get(hardware, HARDWARE_PROFILES["cpu_intel_xeon"])

	def compute_energy(self, flops: int, batch_size: int = 1,
	memory_gb: float = 0.0) -> float:
	"""
	Estimate energy for a forward pass.

	Args:
	flops: Total floating-point operations.
	batch_size: Batch size (for utilization scaling).
	memory_gb: Data transferred to/from memory.

	Returns:
	Energy in microjoules (μJ).
	"""
	# Compute energy
	compute_uj = flops * self.profile.energy_per_flop_uj

	# Utilization penalty (sub-linear at small batch sizes)
	utilization = min(1.0, batch_size / 16) # Saturates at bs=16
	if utilization < 1.0:
	compute_uj *= 1.0 / max(0.2, utilization)

	# Memory transfer energy
	memory_uj = memory_gb * self.memory_transfer_cost_uj_per_gb

	return compute_uj + memory_uj

	def quantum_energy(self, n_qubits: int, n_layers: int,
	n_tokens: int) -> float:
	"""
	Estimate energy for quantum circuit simulation.

	Quantum simulation cost scales as ~O(2^n_qubits) for statevector,
	modified by circuit depth (n_layers).

	Args:
	n_qubits: Number of qubits.
	n_layers: Circuit depth.
	n_tokens: Number of tokens processed.

	Returns:
	Energy in microjoules.
	"""
	# Base cost for one quantum circuit evaluation
	base_ops = (2 ** n_qubits) * n_layers * 100 # ~100 classical ops per quantum op
	energy = base_ops * self.profile.energy_per_flop_uj * self.quantum_overhead_factor
	return energy * n_tokens

	def carbon_footprint(self, energy_uj: float) -> float:
	"""
	Convert energy to carbon footprint.

	Args:
	energy_uj: Energy in microjoules.

	Returns:
	Carbon in grams CO2.
	"""
	energy_kwh = energy_uj * 1e-12 # μJ → kWh
	return energy_kwh * self.profile.carbon_intensity_g_per_kwh

	def training_energy_estimate(self, total_flops: int, n_epochs: int,
	batch_size: int, dataset_size: int,
	quantum_tokens_per_batch: int = 0,
	n_qubits: int = 4, n_qlayers: int = 2) -> Dict:
	"""
	Estimate total training energy.

	Returns:
	Dict with energy breakdown.
	"""
	steps_per_epoch = math.ceil(dataset_size / batch_size)
	total_steps = steps_per_epoch * n_epochs

	# Classical compute
	classical_uj = self.compute_energy(total_flops * total_steps, batch_size)
	classical_carbon = self.carbon_footprint(classical_uj)

	# Quantum overhead
	quantum_uj = 0.0
	if quantum_tokens_per_batch > 0:
	quantum_uj = self.quantum_energy(
	n_qubits, n_qlayers, quantum_tokens_per_batch
	) * total_steps
	quantum_carbon = self.carbon_footprint(quantum_uj)

	total_uj = classical_uj + quantum_uj
	total_carbon = classical_carbon + quantum_carbon

	# Equivalent comparisons
	smartphone_charges = total_uj / (15 * 3600 * 1e6) # 15 Wh phone battery

	return {
	"hardware": self.profile.name,
	"total_energy_uj": total_uj,
	"total_energy_j": total_uj * 1e-6,
	"total_energy_kwh": total_uj * 1e-12,
	"classical_energy_uj": classical_uj,
	"quantum_energy_uj": quantum_uj,
	"carbon_g": total_carbon,
	"carbon_kg": total_carbon / 1000,
	"equivalent_smartphone_charges": smartphone_charges,
	"training_steps": total_steps,
	}

	def compare_hardware(self, flops: int, batch_size: int = 16) -> Dict[str, float]:
	"""Compare energy across hardware targets."""
	results = {}
	for hw_name in HARDWARE_PROFILES:
	if hw_name.startswith("quantum"):
	continue # Quantum not comparable for classical FLOPs
	self.set_hardware(hw_name)
	results[hw_name] = self.compute_energy(flops, batch_size)
	return results


	# ─── Pareto Frontier Tracker ────────────────────────────────────────────────

	class ParetoTracker:
	"""
	Tracks the accuracy-efficiency Pareto frontier during training.

	Records checkpoints where:
	- Perplexity improved at same energy
	- Energy reduced at same perplexity
	"""

	def __init__(self):
	self.pareto_points: list = [] # [(ppl, energy_uj, step), ...]

	def record(self, ppl: float, energy_uj: float, step: int):
	"""Record a point. Returns True if it's Pareto-optimal."""
	is_pareto = True
	for p, e, _ in self.pareto_points:
	if p <= ppl and e <= energy_uj:
	# Existing point dominates this one
	is_pareto = False
	break

	if is_pareto:
	# Remove any dominated points
	self.pareto_points = [
	(p, e, s) for p, e, s in self.pareto_points
	if not (ppl < p and energy_uj < e)
	]
	self.pareto_points.append((ppl, energy_uj, step))
	self.pareto_points.sort(key=lambda x: x[0])

	return is_pareto

	def get_best_efficiency(self) -> Optional[Tuple[float, float]]:
	"""Get the best energy-efficiency tradeoff (lowest energy with good ppl)."""
	if not self.pareto_points:
	return None
	# Best = Pareto point with lowest energy among those within 10% of best ppl
	best_ppl = min(p for p, _, _ in self.pareto_points)
	candidates = [(e, p) for p, e, _ in self.pareto_points
	if p <= best_ppl * 1.1]
	if not candidates:
	return None
	best_energy, ppl = min(candidates, key=lambda x: x[0])
	return (ppl, best_energy)

	def summary(self) -> Dict:
	"""Return Pareto frontier summary."""
	if not self.pareto_points:
	return {"points": 0}
	return {
	"points": len(self.pareto_points),
	"best_ppl": min(p for p, _, _ in self.pareto_points),
	"min_energy_uj": min(e for _, e, _ in self.pareto_points),
	"frontier": [(round(p, 2), round(e, 2)) for p, e, _ in self.pareto_points],
	}


	# ─── Convenience Functions ──────────────────────────────────────────────────

	def estimate_model_energy(model, estimator: EnergyEstimatorV4,
	seq_len: int = 128, batch_size: int = 1) -> Dict:
	"""Quick energy estimate for a model."""
	total_params = sum(p.numel() for p in model.parameters())

	# FLOPs estimate: ~2 * params * batch * seq_len (multiply-add per token)
	flops = int(2 * total_params * batch_size * seq_len)

	# Memory: approx model size in GB
	memory_gb = total_params * 4 / 1e9 # fp32 = 4 bytes/param

	energy = estimator.compute_energy(flops, batch_size, memory_gb)
	carbon = estimator.carbon_footprint(energy)

	return {
	"flops_estimate": flops,
	"energy_uj": energy,
	"energy_mj": energy / 1e6,
	"carbon_per_query_ug": carbon * 1e6, # μg CO2
	"params": total_params,
	"model_size_mb": total_params * 4 / 1e6,
	"hardware": estimator.profile.name,
	}