Q-TensorFormer / src /energy_v4.py
Premchan369's picture
Upload src/energy_v4.py
407c5f4 verified
"""
V4 Energy-Aware Training Module.
Implements energy-constrained optimization with hardware-aware cost models.
Based on research from quantum ML energy benchmarking and green AI principles.
Key features:
- Hardware-specific energy models (CPU, GPU, edge TPU, quantum simulator)
- FLOPs β†’ energy conversion with hardware-specific coefficients
- Energy-accuracy Pareto frontier tracking
- Carbon-aware scheduling (time-of-day energy mix)
- Quantum circuit energy overhead estimation
References:
- Patterson et al. "Carbon Emissions and Large Neural Network Training" (2021)
- Luccioni et al. "Estimating the Carbon Footprint of BLOOM" (2023)
- QKAN (arXiv:2509.14026) β€” energy-efficient quantum activation
"""
import torch
import time
import math
from typing import Dict, Optional, Tuple
from dataclasses import dataclass, field
# ─── Hardware Energy Models ─────────────────────────────────────────────────
@dataclass
class HardwareProfile:
"""Energy and performance profile for a hardware target."""
name: str
flops_per_second: float # Peak FLOPS
watts_idle: float # Idle power (W)
watts_peak: float # Peak power (W)
energy_per_flop_uj: float # ΞΌJ per FLOP
memory_bandwidth_gbs: float # GB/s
carbon_intensity_g_per_kwh: float = 400 # gCO2/kWh (global average)
# Hardware profiles (empirically calibrated)
HARDWARE_PROFILES = {
"cpu_intel_xeon": HardwareProfile(
name="Intel Xeon (CPU)",
flops_per_second=500e9, # 500 GFLOPS
watts_idle=30,
watts_peak=150,
energy_per_flop_uj=3e-7, # 0.3 pJ/FLOP β†’ 3e-7 ΞΌJ
memory_bandwidth_gbs=50,
carbon_intensity_g_per_kwh=400,
),
"cpu_apple_m2": HardwareProfile(
name="Apple M2 (CPU)",
flops_per_second=1.5e12, # 1.5 TFLOPS
watts_idle=3,
watts_peak=20,
energy_per_flop_uj=1.3e-8, # Very efficient
memory_bandwidth_gbs=100,
carbon_intensity_g_per_kwh=400,
),
"gpu_a100": HardwareProfile(
name="NVIDIA A100 (GPU)",
flops_per_second=312e12, # 312 TFLOPS (bf16)
watts_idle=50,
watts_peak=400,
energy_per_flop_uj=1.3e-9, # 1.3 fJ β†’ 1.3e-9 ΞΌJ
memory_bandwidth_gbs=2000,
carbon_intensity_g_per_kwh=400,
),
"gpu_t4": HardwareProfile(
name="NVIDIA T4 (GPU)",
flops_per_second=65e12, # 65 TFLOPS (fp16)
watts_idle=15,
watts_peak=70,
energy_per_flop_uj=1.1e-9,
memory_bandwidth_gbs=320,
carbon_intensity_g_per_kwh=400,
),
"edge_tpu": HardwareProfile(
name="Google Edge TPU",
flops_per_second=4e12, # 4 TOPS (int8)
watts_idle=0.5,
watts_peak=2,
energy_per_flop_uj=5e-10, # 0.5 fJ β€” most efficient
memory_bandwidth_gbs=30,
carbon_intensity_g_per_kwh=400,
),
"edge_mobile": HardwareProfile(
name="Mobile CPU (Edge)",
flops_per_second=50e9, # 50 GFLOPS
watts_idle=0.3,
watts_peak=5,
energy_per_flop_uj=1e-7, # 0.1 pJ
memory_bandwidth_gbs=20,
carbon_intensity_g_per_kwh=400,
),
"quantum_simulator": HardwareProfile(
name="PennyLane Quantum Simulator",
flops_per_second=1e9, # Very slow β€” CPU-bound simulation
watts_idle=30,
watts_peak=150,
energy_per_flop_uj=1e-6, # 1 pJ β€” much higher due to simulation overhead
memory_bandwidth_gbs=20,
carbon_intensity_g_per_kwh=400,
),
"quantum_hardware_ibm": HardwareProfile(
name="IBM Quantum (Eagle)",
flops_per_second=1e6, # Quantum: no FLOPs, use equivalent
watts_idle=50, # Cryogenic cooling
watts_peak=25000, # ~25 kW for dilution fridge
energy_per_flop_uj=1.0, # Per-quantum-gate equivalent ~1 ΞΌJ
memory_bandwidth_gbs=0.01,
carbon_intensity_g_per_kwh=400,
),
}
# ─── Energy Estimator ────────────────────────────────────────────────────────
class EnergyEstimatorV4:
"""
V4 energy estimator with hardware-aware cost models.
Accounts for:
- Compute energy (FLOPs β†’ ΞΌJ)
- Memory transfer energy
- Quantum circuit simulation overhead
- Idle power during data loading
- Batch size effects on utilization
All energy values in microjoules (ΞΌJ).
"""
def __init__(self, hardware: str = "cpu_intel_xeon"):
self.set_hardware(hardware)
# Overhead multipliers
self.quantum_overhead_factor = 50.0 # Quantum sim is ~50Γ— more expensive per "FLOP"
self.memory_transfer_cost_uj_per_gb = 500.0 # ~500 ΞΌJ per GB transferred
def set_hardware(self, hardware: str):
"""Switch hardware target."""
self.hardware_name = hardware
self.profile = HARDWARE_PROFILES.get(hardware, HARDWARE_PROFILES["cpu_intel_xeon"])
def compute_energy(self, flops: int, batch_size: int = 1,
memory_gb: float = 0.0) -> float:
"""
Estimate energy for a forward pass.
Args:
flops: Total floating-point operations.
batch_size: Batch size (for utilization scaling).
memory_gb: Data transferred to/from memory.
Returns:
Energy in microjoules (ΞΌJ).
"""
# Compute energy
compute_uj = flops * self.profile.energy_per_flop_uj
# Utilization penalty (sub-linear at small batch sizes)
utilization = min(1.0, batch_size / 16) # Saturates at bs=16
if utilization < 1.0:
compute_uj *= 1.0 / max(0.2, utilization)
# Memory transfer energy
memory_uj = memory_gb * self.memory_transfer_cost_uj_per_gb
return compute_uj + memory_uj
def quantum_energy(self, n_qubits: int, n_layers: int,
n_tokens: int) -> float:
"""
Estimate energy for quantum circuit simulation.
Quantum simulation cost scales as ~O(2^n_qubits) for statevector,
modified by circuit depth (n_layers).
Args:
n_qubits: Number of qubits.
n_layers: Circuit depth.
n_tokens: Number of tokens processed.
Returns:
Energy in microjoules.
"""
# Base cost for one quantum circuit evaluation
base_ops = (2 ** n_qubits) * n_layers * 100 # ~100 classical ops per quantum op
energy = base_ops * self.profile.energy_per_flop_uj * self.quantum_overhead_factor
return energy * n_tokens
def carbon_footprint(self, energy_uj: float) -> float:
"""
Convert energy to carbon footprint.
Args:
energy_uj: Energy in microjoules.
Returns:
Carbon in grams CO2.
"""
energy_kwh = energy_uj * 1e-12 # ΞΌJ β†’ kWh
return energy_kwh * self.profile.carbon_intensity_g_per_kwh
def training_energy_estimate(self, total_flops: int, n_epochs: int,
batch_size: int, dataset_size: int,
quantum_tokens_per_batch: int = 0,
n_qubits: int = 4, n_qlayers: int = 2) -> Dict:
"""
Estimate total training energy.
Returns:
Dict with energy breakdown.
"""
steps_per_epoch = math.ceil(dataset_size / batch_size)
total_steps = steps_per_epoch * n_epochs
# Classical compute
classical_uj = self.compute_energy(total_flops * total_steps, batch_size)
classical_carbon = self.carbon_footprint(classical_uj)
# Quantum overhead
quantum_uj = 0.0
if quantum_tokens_per_batch > 0:
quantum_uj = self.quantum_energy(
n_qubits, n_qlayers, quantum_tokens_per_batch
) * total_steps
quantum_carbon = self.carbon_footprint(quantum_uj)
total_uj = classical_uj + quantum_uj
total_carbon = classical_carbon + quantum_carbon
# Equivalent comparisons
smartphone_charges = total_uj / (15 * 3600 * 1e6) # 15 Wh phone battery
return {
"hardware": self.profile.name,
"total_energy_uj": total_uj,
"total_energy_j": total_uj * 1e-6,
"total_energy_kwh": total_uj * 1e-12,
"classical_energy_uj": classical_uj,
"quantum_energy_uj": quantum_uj,
"carbon_g": total_carbon,
"carbon_kg": total_carbon / 1000,
"equivalent_smartphone_charges": smartphone_charges,
"training_steps": total_steps,
}
def compare_hardware(self, flops: int, batch_size: int = 16) -> Dict[str, float]:
"""Compare energy across hardware targets."""
results = {}
for hw_name in HARDWARE_PROFILES:
if hw_name.startswith("quantum"):
continue # Quantum not comparable for classical FLOPs
self.set_hardware(hw_name)
results[hw_name] = self.compute_energy(flops, batch_size)
return results
# ─── Pareto Frontier Tracker ────────────────────────────────────────────────
class ParetoTracker:
"""
Tracks the accuracy-efficiency Pareto frontier during training.
Records checkpoints where:
- Perplexity improved at same energy
- Energy reduced at same perplexity
"""
def __init__(self):
self.pareto_points: list = [] # [(ppl, energy_uj, step), ...]
def record(self, ppl: float, energy_uj: float, step: int):
"""Record a point. Returns True if it's Pareto-optimal."""
is_pareto = True
for p, e, _ in self.pareto_points:
if p <= ppl and e <= energy_uj:
# Existing point dominates this one
is_pareto = False
break
if is_pareto:
# Remove any dominated points
self.pareto_points = [
(p, e, s) for p, e, s in self.pareto_points
if not (ppl < p and energy_uj < e)
]
self.pareto_points.append((ppl, energy_uj, step))
self.pareto_points.sort(key=lambda x: x[0])
return is_pareto
def get_best_efficiency(self) -> Optional[Tuple[float, float]]:
"""Get the best energy-efficiency tradeoff (lowest energy with good ppl)."""
if not self.pareto_points:
return None
# Best = Pareto point with lowest energy among those within 10% of best ppl
best_ppl = min(p for p, _, _ in self.pareto_points)
candidates = [(e, p) for p, e, _ in self.pareto_points
if p <= best_ppl * 1.1]
if not candidates:
return None
best_energy, ppl = min(candidates, key=lambda x: x[0])
return (ppl, best_energy)
def summary(self) -> Dict:
"""Return Pareto frontier summary."""
if not self.pareto_points:
return {"points": 0}
return {
"points": len(self.pareto_points),
"best_ppl": min(p for p, _, _ in self.pareto_points),
"min_energy_uj": min(e for _, e, _ in self.pareto_points),
"frontier": [(round(p, 2), round(e, 2)) for p, e, _ in self.pareto_points],
}
# ─── Convenience Functions ──────────────────────────────────────────────────
def estimate_model_energy(model, estimator: EnergyEstimatorV4,
seq_len: int = 128, batch_size: int = 1) -> Dict:
"""Quick energy estimate for a model."""
total_params = sum(p.numel() for p in model.parameters())
# FLOPs estimate: ~2 * params * batch * seq_len (multiply-add per token)
flops = int(2 * total_params * batch_size * seq_len)
# Memory: approx model size in GB
memory_gb = total_params * 4 / 1e9 # fp32 = 4 bytes/param
energy = estimator.compute_energy(flops, batch_size, memory_gb)
carbon = estimator.carbon_footprint(energy)
return {
"flops_estimate": flops,
"energy_uj": energy,
"energy_mj": energy / 1e6,
"carbon_per_query_ug": carbon * 1e6, # ΞΌg CO2
"params": total_params,
"model_size_mb": total_params * 4 / 1e6,
"hardware": estimator.profile.name,
}