"""Flux2 transformer layers for LightDiffusion-Next.
Core building blocks for the Flux2 architecture:
- Attention mechanisms
- Modulation layers
- Transformer blocks (double and single stream)
- Embedding layers
Adapted from ComfyUI's Flux implementation for LightDiffusion-Next.
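
Illustrative composition (a minimal sketch with toy sizes, not the real
Flux2 configuration):

    >>> import torch
    >>> ids = torch.zeros(1, 80, 3, dtype=torch.long)
    >>> pe = EmbedND(dim=32, theta=10000, axes_dim=[8, 12, 12])(ids)
    >>> img, txt = torch.randn(1, 64, 128), torch.randn(1, 16, 128)
    >>> vec = torch.randn(1, 128)
    >>> img, txt = DoubleStreamBlock(128, 4, 4.0)(img, txt, vec, pe)
    >>> x = SingleStreamBlock(128, 4)(torch.cat((txt, img), dim=1), vec, pe)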
"""
from dataclasses import dataclass
from typing import Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from src.cond import cast as ops_module
from src.Device import Device
# Get operations module
def get_ops():
"""Get the operations module for weight initialization."""
return ops_module.disable_weight_init
class RMSNorm(nn.Module):
"""Root Mean Square Layer Normalization.
Uses native PyTorch rms_norm when available for numerical consistency with ComfyUI.
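
    Example (illustrative; toy sizes, default unit scale):
        >>> norm = RMSNorm(8)
        >>> norm(torch.randn(2, 4, 8)).shape
        torch.Size([2, 4, 8])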
"""
def __init__(self, dim: int, eps: float = 1e-6, dtype=None, device=None):
super().__init__()
self.eps = eps
# Use 'scale' to match Flux2 checkpoint naming convention
self.scale = nn.Parameter(torch.ones(dim, dtype=dtype, device=device))
# Check if native rms_norm is available
self._use_native = hasattr(torch.nn.functional, 'rms_norm')
def forward(self, x: torch.Tensor) -> torch.Tensor:
# Ensure scale is on the same device as input
scale = self.scale.to(x.device, x.dtype)
if self._use_native and not (torch.jit.is_tracing() or torch.jit.is_scripting()):
# Use native PyTorch rms_norm for better precision (matches ComfyUI)
return torch.nn.functional.rms_norm(x, scale.shape, weight=scale, eps=self.eps)
else:
# Fallback implementation
rms = torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + self.eps)
return x * rms * scale
class EmbedND(nn.Module):
"""N-dimensional positional embedding using RoPE."""
def __init__(self, dim: int, theta: int, axes_dim: list[int]):
super().__init__()
self.dim = dim
self.theta = theta
self.axes_dim = axes_dim
def forward(self, ids: torch.Tensor) -> torch.Tensor:
"""Compute rotary positional embeddings.
Args:
ids: Position IDs tensor of shape [batch, seq_len, num_axes]
        Returns:
            Rotary embeddings of shape [batch, 1, seq_len, sum(axes_dim)//2, 2, 2]
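
        Example (illustrative; 16 = sum(axes_dim) // 2):
            >>> emb = EmbedND(dim=32, theta=10000, axes_dim=[8, 12, 12])
            >>> emb(torch.zeros(1, 80, 3, dtype=torch.long)).shape
            torch.Size([1, 1, 80, 16, 2, 2])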
"""
n_axes = ids.shape[-1]
emb = torch.cat(
[rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
dim=-3,
)
return emb.unsqueeze(1)
def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
"""Compute rotary position embeddings.
Matches ComfyUI's implementation exactly for numerical precision.
Args:
pos: Position indices
dim: Embedding dimension
theta: Base frequency
    Returns:
        Float32 tensor of stacked 2x2 rotation matrices (built from cos and
        sin of the position-frequency products), shape [..., seq, dim//2, 2, 2]
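
    Example (illustrative; 8 = dim // 2 rotation matrices per position):
        >>> rope(torch.arange(5)[None], 16, 10000).shape
        torch.Size([1, 5, 8, 2, 2])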
"""
assert dim % 2 == 0
device = pos.device
# ComfyUI uses float64 for scale calculation for maximum precision
scale = torch.linspace(0, (dim - 2) / dim, dim // 2, dtype=torch.float64, device=device)
omega = 1.0 / (theta ** scale)
# Einsum for position-frequency interaction - cast pos to float32 like ComfyUI
out = torch.einsum("...n,d->...nd", pos.to(dtype=torch.float32, device=device), omega)
out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
# ComfyUI always returns float32 for RoPE embeddings
return out.to(dtype=torch.float32, device=pos.device)
class MLPEmbedder(nn.Module):
"""MLP for timestep and guidance embeddings."""
def __init__(self, in_dim: int, hidden_dim: int, dtype=None, device=None, operations=None, ops_bias: bool = True):
super().__init__()
if operations is None:
operations = get_ops()
self.in_layer = operations.Linear(in_dim, hidden_dim, bias=ops_bias, dtype=dtype, device=device)
self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=ops_bias, dtype=dtype, device=device)
self.silu = nn.SiLU()
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.out_layer(self.silu(self.in_layer(x)))
class GatedMLP(nn.Module):
"""Gated MLP (SwiGLU) for Klein models.
Structure: hidden -> 2*intermediate -> SiLU gate -> intermediate -> hidden
The first linear produces gate and value activations,
SiLU is applied to gate, then gate * value, then final projection.
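
    Example (illustrative; toy sizes):
        >>> mlp = GatedMLP(hidden_size=64, intermediate_size=128)
        >>> mlp(torch.randn(2, 10, 64)).shape  # 64 -> 2*128 -> 128 -> 64
        torch.Size([2, 10, 64])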
"""
def __init__(self, hidden_size: int, intermediate_size: int, dtype=None, device=None, operations=None, ops_bias: bool = True):
super().__init__()
if operations is None:
operations = get_ops()
# First layer outputs 2x intermediate for gating
self.gate_up_proj = operations.Linear(hidden_size, intermediate_size * 2, bias=ops_bias, dtype=dtype, device=device)
self.down_proj = operations.Linear(intermediate_size, hidden_size, bias=ops_bias, dtype=dtype, device=device)
self.act = nn.SiLU()
def forward(self, x: torch.Tensor) -> torch.Tensor:
gate_up = self.gate_up_proj(x)
gate, up = gate_up.chunk(2, dim=-1)
return self.down_proj(self.act(gate) * up)
class QKNorm(nn.Module):
"""Query-Key normalization layer."""
def __init__(self, dim: int, dtype=None, device=None, operations=None):
super().__init__()
# Use native RMSNorm instead of operations.RMSNorm
self.query_norm = RMSNorm(dim, dtype=dtype, device=device)
self.key_norm = RMSNorm(dim, dtype=dtype, device=device)
def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
q = self.query_norm(q)
k = self.key_norm(k)
# Cast to v's dtype and device to match ComfyUI (crucial for numerical consistency)
return q.to(v), k.to(v)
class SelfAttention(nn.Module):
"""Self-attention with rotary position embedding (RoPE)."""
def __init__(
self,
dim: int,
num_heads: int = 8,
qkv_bias: bool = False,
dtype=None,
device=None,
operations=None,
ops_bias: bool = True,
):
super().__init__()
if operations is None:
operations = get_ops()
self.num_heads = num_heads
head_dim = dim // num_heads
self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
self.proj = operations.Linear(dim, dim, bias=ops_bias, dtype=dtype, device=device)
def forward(self, x: torch.Tensor, pe: torch.Tensor) -> torch.Tensor:
qkv = self.qkv(x)
q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
q, k = self.norm(q, k, v)
x = attention(q, k, v, pe=pe)
x = self.proj(x)
return x
def attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, pe: Optional[torch.Tensor], mask: Optional[torch.Tensor] = None) -> torch.Tensor:
"""Apply attention with rotary position embeddings.
Args:
q: Query tensor [batch, heads, seq, dim]
k: Key tensor [batch, heads, seq, dim]
v: Value tensor [batch, heads, seq, dim]
        pe: Rotary positional embeddings, or None to skip RoPE
        mask: Optional attention mask for padding tokens
Returns:
Attention output [batch, seq, heads*dim]
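
    Example (illustrative; head_dim 32 = 2 * 16 RoPE pairs):
        >>> q = k = v = torch.randn(1, 4, 80, 32)
        >>> pe = EmbedND(dim=32, theta=10000, axes_dim=[8, 12, 12])(
        ...     torch.zeros(1, 80, 3, dtype=torch.long))
        >>> attention(q, k, v, pe).shape
        torch.Size([1, 80, 128])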
"""
    # Validate the positional embedding sequence length to prevent RoPE shape
    # errors, then apply RoPE; both steps are skipped when pe is None
    if pe is not None:
        pe_seq = pe.shape[2] if pe.ndim >= 3 else None
        if pe_seq not in (1, q.shape[2]):
            raise ValueError(
                f"RoPE sequence length mismatch: pe.seq={pe_seq} != q.seq={q.shape[2]}. "
                "Transformer options (img_h/img_w) may not match the input token grid; "
                "check calc_cond_batch merging of transformer_options."
            )
        q, k = apply_rope(q, k, pe)
# Efficient attention implementation
heads = q.shape[1]
x = optimized_attention(q, k, v, heads, mask=mask)
return x
def apply_rope1(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
"""Apply rotary position embedding to a single tensor.
Correctly applies the 2x2 rotation matrix:
y1 = x1 * cos - x2 * sin
y2 = x1 * sin + x2 * cos
Args:
x: Input tensor [batch, heads, seq, dim]
freqs_cis: Frequency tensor [batch, 1, seq, dim//2, 2, 2]
Returns:
Rotated tensor [batch, heads, seq, dim]
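
    Example (illustrative; at position 0 the rotation is the identity):
        >>> x = torch.randn(1, 1, 1, 8)
        >>> pe = EmbedND(dim=8, theta=10000, axes_dim=[8])(
        ...     torch.zeros(1, 1, 1, dtype=torch.long))
        >>> torch.allclose(apply_rope1(x, pe), x)
        True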
"""
# Reshape x to match RoPE components [batch, heads, seq, dim//2, 2]
x_reshaped = x.reshape(*x.shape[:-1], -1, 2)
# Handle differing sequence lengths between x and freqs_cis
# freqs_cis shape: [batch, 1, seq_pe, dim//2, 2, 2]
seq_x = x.shape[2]
seq_pe = freqs_cis.shape[2]
if seq_pe != seq_x:
if seq_pe < seq_x:
# Upsample by repeating along sequence dimension then slice to exact length
repeat = (seq_x + seq_pe - 1) // seq_pe
freqs_cis = freqs_cis.repeat_interleave(repeat, dim=2)[..., :seq_x, :, :, :]
else:
# Slice to match x sequence length
freqs_cis = freqs_cis[..., :seq_x, :, :, :]
# Sanity-check: feature dimension (half of head dim) must match freqs_cis
feat_half = x.shape[-1] // 2
if freqs_cis.shape[-3] != feat_half:
raise ValueError(
f"RoPE feature-dim mismatch: freqs_cis.dim={freqs_cis.shape[-3]} != x.dim/2={feat_half}. "
f"x.shape={x.shape}, freqs_cis.shape={freqs_cis.shape}"
)
# Extract rotation matrix components
# freqs_cis is [..., dim//2, row, col]
# row 0: [cos, -sin]
# row 1: [sin, cos]
cos = freqs_cis[..., 0, 0]
msin = freqs_cis[..., 0, 1] # -sin
sin = freqs_cis[..., 1, 0]
x1 = x_reshaped[..., 0]
x2 = x_reshaped[..., 1]
# Apply rotation
out1 = x1 * cos + x2 * msin
out2 = x1 * sin + x2 * cos
# Combine and reshape back to original
return torch.stack([out1, out2], dim=-1).reshape(*x.shape).type_as(x)
def apply_rope(q: torch.Tensor, k: torch.Tensor, pe: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
"""Apply rotary position embeddings to queries and keys.
Args:
q: Query tensor [batch, heads, seq, dim]
k: Key tensor [batch, heads, seq, dim]
pe: Positional embeddings [..., dim//2, 2, 2]
Returns:
Rotated (q, k) tensors
"""
return apply_rope1(q, pe), apply_rope1(k, pe)
def optimized_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, heads: int, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Optimized attention dispatching to the fastest available backend.
    Dispatch priority: native SDPA under the Device module's backend-priority
    context (cuDNN > Flash > memory-efficient) > xformers > a plain SDPA call.
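
    Example (illustrative; toy sizes):
        >>> q = k = v = torch.randn(1, 4, 80, 32)
        >>> optimized_attention(q, k, v, heads=4).shape
        torch.Size([1, 80, 128])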
"""
    b, _, seq_q, _ = q.shape
    # Convert an optional key-padding mask (1 = attend, 0 = masked padding)
    # into the additive form SDPA expects (0 for attend, -inf for masked),
    # broadcastable to [batch, heads, seq_q, seq_kv]
    attn_mask = None
    if mask is not None:
        if mask.ndim == 2:
            attn_mask = mask.unsqueeze(1).unsqueeze(1)  # [B, L] -> [B, 1, 1, L]
        elif mask.ndim == 3:
            attn_mask = mask.unsqueeze(1)  # [B, L, L] -> [B, 1, L, L]
        else:
            attn_mask = mask
        attn_mask = (1.0 - attn_mask.to(dtype=q.dtype)) * torch.finfo(q.dtype).min
    # Method 1: native scaled_dot_product_attention with backend priority.
    # This is the fastest path on modern PyTorch with GPU support.
    if hasattr(torch.nn.functional, 'scaled_dot_product_attention'):
        try:
            # Get the SDPA backend priority context manager from Device
            sdpa_context = Device.get_sdpa_context()
            # SDPA expects [batch, heads, seq, dim]; q/k/v are already in this format
            with sdpa_context:
                out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
            # Reshape: [batch, heads, seq, dim] -> [batch, seq, heads*dim]
            # Use transpose + reshape to avoid an extra copy where possible
            out = out.transpose(1, 2).reshape(b, seq_q, -1)
            return out
        except Exception:
            pass  # Fall through to xformers
    # Method 2: xformers memory-efficient attention.
    # Skipped when a mask is present: xformers uses a different mask format,
    # and silently dropping the mask would change the result.
    if Device.xformers_enabled() and mask is None:
        try:
            import xformers.ops as xops
            # xformers expects [batch, seq, heads, dim]
            q_xf = q.transpose(1, 2).contiguous()
            k_xf = k.transpose(1, 2).contiguous()
            v_xf = v.transpose(1, 2).contiguous()
            out = xops.memory_efficient_attention(q_xf, k_xf, v_xf)
            del q_xf, k_xf, v_xf  # Free memory early
            # Reshape: [batch, seq, heads, dim] -> [batch, seq, heads*dim]
            out = out.reshape(b, seq_q, -1)
            return out
        except Exception:
            pass  # Fall through to the final fallback
    # Method 3: plain SDPA call without backend selection (last resort)
    out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
    out = out.transpose(1, 2).reshape(b, seq_q, -1)
    return out
@dataclass
class ModulationOut:
"""Output of modulation layer."""
shift: torch.Tensor
scale: torch.Tensor
gate: torch.Tensor
class Modulation(nn.Module):
"""Adaptive layer normalization modulation.
Applies shift, scale, and gate from conditioning vector.
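
    Example (illustrative; single modulation, so the second output is None):
        >>> mod = Modulation(dim=64, double=False)
        >>> m1, m2 = mod(torch.randn(2, 64))
        >>> m1.shift.shape, m2 is None
        (torch.Size([2, 1, 64]), True)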
"""
def __init__(self, dim: int, double: bool, dtype=None, device=None, operations=None, ops_bias: bool = True):
super().__init__()
if operations is None:
operations = get_ops()
self.is_double = double
self.multiplier = 6 if double else 3
self.lin = operations.Linear(dim, self.multiplier * dim, bias=ops_bias, dtype=dtype, device=device)
def forward(self, vec: torch.Tensor) -> tuple[ModulationOut, ModulationOut | None]:
out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
mod1 = ModulationOut(shift=out[0], scale=out[1], gate=out[2])
mod2 = ModulationOut(shift=out[3], scale=out[4], gate=out[5]) if self.is_double else None
return mod1, mod2
class GlobalModulation(nn.Module):
"""Global modulation for Flux2 (Klein) double stream blocks."""
def __init__(self, dim: int, dtype=None, device=None, operations=None, ops_bias: bool = True):
super().__init__()
if operations is None:
operations = get_ops()
# 12 outputs: 6 for img stream, 6 for txt stream
self.lin = operations.Linear(dim, 12 * dim, bias=ops_bias, dtype=dtype, device=device)
def forward(self, vec: torch.Tensor) -> tuple[ModulationOut, ModulationOut, ModulationOut, ModulationOut]:
out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(12, dim=-1)
mod1_img = ModulationOut(shift=out[0], scale=out[1], gate=out[2])
mod2_img = ModulationOut(shift=out[3], scale=out[4], gate=out[5])
mod1_txt = ModulationOut(shift=out[6], scale=out[7], gate=out[8])
mod2_txt = ModulationOut(shift=out[9], scale=out[10], gate=out[11])
return mod1_img, mod2_img, mod1_txt, mod2_txt
class DoubleStreamBlock(nn.Module):
"""Transformer block with separate image and text streams.
Uses joint attention but separate MLPs for image and text.
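
    Example (illustrative; toy sizes with per-block modulation):
        >>> blk = DoubleStreamBlock(hidden_size=128, num_heads=4, mlp_ratio=4.0)
        >>> pe = EmbedND(dim=32, theta=10000, axes_dim=[8, 12, 12])(
        ...     torch.zeros(1, 80, 3, dtype=torch.long))
        >>> img, txt = blk(torch.randn(1, 64, 128), torch.randn(1, 16, 128),
        ...                torch.randn(1, 128), pe)
        >>> img.shape, txt.shape
        (torch.Size([1, 64, 128]), torch.Size([1, 16, 128]))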
"""
def __init__(
self,
hidden_size: int,
num_heads: int,
mlp_ratio: float,
qkv_bias: bool = False,
global_modulation: bool = False,
dtype=None,
device=None,
operations=None,
flax_compatible: bool = False,
silu_mlp: bool = False,
gated_mlp: bool = False,
ops_bias: bool = True, # Whether to use bias in linear layers
):
super().__init__()
if operations is None:
operations = get_ops()
self.hidden_size = hidden_size
self.num_heads = num_heads
self.flax_compatible = flax_compatible
self.silu_mlp = silu_mlp
self.gated_mlp = gated_mlp
# For gated MLP (Klein), mlp_ratio is the true ratio
# First layer outputs 2x for gating: hidden -> 2*intermediate
# Second layer: intermediate -> hidden
if gated_mlp:
mlp_intermediate = int(hidden_size * mlp_ratio)
mlp_hidden_dim = mlp_intermediate * 2 # Double for gate+up projection
else:
mlp_hidden_dim = int(hidden_size * mlp_ratio)
mlp_intermediate = mlp_hidden_dim
        self.double_stream_modulation = None
        if global_modulation:
            # Modulation is computed once at the model level and passed into forward()
            self.img_mod = None
            self.txt_mod = None
            self.use_global_modulation = True
        else:
            # Per-block modulation (Flux1 style)
            self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations, ops_bias=ops_bias)
            self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations, ops_bias=ops_bias)
            self.use_global_modulation = False
# Image stream
self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.img_attn = SelfAttention(hidden_size, num_heads, qkv_bias, dtype=dtype, device=device, operations=operations, ops_bias=ops_bias)
self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
if gated_mlp:
# Gated MLP with naming compatible with checkpoint: .0, .1 (identity), .2
self.img_mlp = nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=ops_bias, dtype=dtype, device=device),
nn.Identity(), # Placeholder for index 1
operations.Linear(mlp_intermediate, hidden_size, bias=ops_bias, dtype=dtype, device=device),
)
else:
self.img_mlp = nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=ops_bias, dtype=dtype, device=device),
nn.SiLU() if silu_mlp else nn.GELU(approximate="tanh"),
operations.Linear(mlp_hidden_dim, hidden_size, bias=ops_bias, dtype=dtype, device=device),
)
# Text stream
self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.txt_attn = SelfAttention(hidden_size, num_heads, qkv_bias, dtype=dtype, device=device, operations=operations, ops_bias=ops_bias)
self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
if gated_mlp:
self.txt_mlp = nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=ops_bias, dtype=dtype, device=device),
nn.Identity(),
operations.Linear(mlp_intermediate, hidden_size, bias=ops_bias, dtype=dtype, device=device),
)
else:
self.txt_mlp = nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=ops_bias, dtype=dtype, device=device),
nn.SiLU() if silu_mlp else nn.GELU(approximate="tanh"),
operations.Linear(mlp_hidden_dim, hidden_size, bias=ops_bias, dtype=dtype, device=device),
)
def forward(
self,
img: torch.Tensor,
txt: torch.Tensor,
vec: torch.Tensor,
pe: torch.Tensor,
attn_mask=None,
img_mod: tuple = None, # (img_mod1, img_mod2) from global modulation
txt_mod: tuple = None, # (txt_mod1, txt_mod2) from global modulation
) -> tuple[torch.Tensor, torch.Tensor]:
# Get modulation parameters
if self.use_global_modulation and img_mod is not None and txt_mod is not None:
# Use global modulation passed from model level
img_mod1, img_mod2 = img_mod
txt_mod1, txt_mod2 = txt_mod
elif self.img_mod is not None and self.txt_mod is not None:
# Use per-block modulation (Flux1 style)
img_mod1, img_mod2 = self.img_mod(vec)
txt_mod1, txt_mod2 = self.txt_mod(vec)
else:
raise ValueError("No modulation available - either provide global or use per-block modulation")
# Prepare normed inputs
img_normed = self.img_norm1(img)
img_modulated = (1 + img_mod1.scale) * img_normed + img_mod1.shift
del img_normed # Free memory early
txt_normed = self.txt_norm1(txt)
txt_modulated = (1 + txt_mod1.scale) * txt_normed + txt_mod1.shift
del txt_normed # Free memory early
# Run joint attention - use view+permute for efficiency instead of rearrange
img_qkv = self.img_attn.qkv(img_modulated)
del img_modulated
q_img, k_img, v_img = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
del img_qkv
txt_qkv = self.txt_attn.qkv(txt_modulated)
del txt_modulated
q_txt, k_txt, v_txt = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
del txt_qkv
q_img, k_img = self.img_attn.norm(q_img, k_img, v_img)
q_txt, k_txt = self.txt_attn.norm(q_txt, k_txt, v_txt)
# Concatenate for joint attention
q = torch.cat((q_txt, q_img), dim=2)
del q_txt, q_img
k = torch.cat((k_txt, k_img), dim=2)
del k_txt, k_img
v = torch.cat((v_txt, v_img), dim=2)
del v_txt, v_img
attn_out = attention(q, k, v, pe=pe, mask=attn_mask)
del q, k, v
txt_attn, img_attn = attn_out[:, : txt.shape[1]], attn_out[:, txt.shape[1] :]
del attn_out
# Apply residual connections with gating
img = img + img_mod1.gate * self.img_attn.proj(img_attn)
del img_attn
txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
del txt_attn
# MLP with modulation
img_mlp_in = (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
img = img + img_mod2.gate * self._forward_mlp(self.img_mlp, img_mlp_in)
del img_mlp_in
txt_mlp_in = (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
txt = txt + txt_mod2.gate * self._forward_mlp(self.txt_mlp, txt_mlp_in)
del txt_mlp_in
# Handle fp16 numerical issues (matches ComfyUI exactly)
if txt.dtype == torch.float16:
txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
return img, txt
def _forward_mlp(self, mlp: nn.Sequential, x: torch.Tensor) -> torch.Tensor:
"""Forward through MLP, handling both standard and gated variants."""
if self.gated_mlp:
# Gated MLP: split into gate and up, apply SiLU to gate, multiply, project
gate_up = mlp[0](x)
gate, up = gate_up.chunk(2, dim=-1)
hidden = F.silu(gate) * up
return mlp[2](hidden)
else:
return mlp(x)
class SingleStreamBlock(nn.Module):
"""Transformer block with merged image and text stream.
Used after the double stream blocks have processed both modalities.
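
    Example (illustrative; toy sizes with per-block modulation):
        >>> blk = SingleStreamBlock(hidden_size=128, num_heads=4)
        >>> pe = EmbedND(dim=32, theta=10000, axes_dim=[8, 12, 12])(
        ...     torch.zeros(1, 80, 3, dtype=torch.long))
        >>> blk(torch.randn(1, 80, 128), torch.randn(1, 128), pe).shape
        torch.Size([1, 80, 128])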
"""
def __init__(
self,
hidden_size: int,
num_heads: int,
mlp_ratio: float = 4.0,
        qk_scale: Optional[float] = None,
dtype=None,
device=None,
operations=None,
silu_mlp: bool = False,
gated_mlp: bool = False,
ops_bias: bool = True,
global_modulation: bool = False,
):
super().__init__()
if operations is None:
operations = get_ops()
self.hidden_dim = hidden_size
self.num_heads = num_heads
head_dim = hidden_size // num_heads
self.scale = qk_scale or head_dim ** -0.5
self.silu_mlp = silu_mlp
self.gated_mlp = gated_mlp
self.use_global_modulation = global_modulation
# For gated MLP, mlp_ratio gives intermediate size
# linear1 outputs gate+up (2x intermediate), linear2 takes intermediate
if gated_mlp:
self.mlp_intermediate = int(hidden_size * mlp_ratio)
self.mlp_gate_up_dim = self.mlp_intermediate * 2
linear1_out = hidden_size * 3 + self.mlp_gate_up_dim
linear2_in = hidden_size + self.mlp_intermediate
else:
self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
linear1_out = hidden_size * 3 + self.mlp_hidden_dim
linear2_in = hidden_size + self.mlp_hidden_dim
# Joint QKV and MLP projection
self.linear1 = operations.Linear(
hidden_size, linear1_out, bias=ops_bias, dtype=dtype, device=device
)
self.linear2 = operations.Linear(
linear2_in, hidden_size, bias=ops_bias, dtype=dtype, device=device
)
self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
self.hidden_size = hidden_size
self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
# Only create per-block modulation if not using global modulation
if not global_modulation:
self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations, ops_bias=ops_bias)
else:
self.modulation = None
def forward(
self,
x: torch.Tensor,
vec: torch.Tensor,
pe: torch.Tensor,
attn_mask=None,
modulation=None, # ModulationOut from global modulation
) -> torch.Tensor:
# Get modulation
if self.use_global_modulation and modulation is not None:
mod = modulation
elif self.modulation is not None:
mod, _ = self.modulation(vec)
else:
raise ValueError("No modulation available - either provide global or use per-block modulation")
x_normed = self.pre_norm(x)
x_mod = (1 + mod.scale) * x_normed + mod.shift
del x_normed # Free memory early
# Joint projection - split QKV from MLP part
qkv_mlp = self.linear1(x_mod)
del x_mod
if self.gated_mlp:
qkv, mlp_gate_up = qkv_mlp.split([self.hidden_size * 3, self.mlp_gate_up_dim], dim=-1)
del qkv_mlp
# Gated MLP: split into gate and up, apply SiLU to gate, multiply
gate, up = mlp_gate_up.chunk(2, dim=-1)
del mlp_gate_up
mlp = F.silu(gate) * up
del gate, up
else:
qkv, mlp = qkv_mlp.split([self.hidden_size * 3, self.mlp_hidden_dim], dim=-1)
del qkv_mlp
# Standard activation
if self.silu_mlp:
mlp = F.silu(mlp)
else:
mlp = F.gelu(mlp, approximate="tanh")
# Attention - use view+permute for efficiency instead of rearrange
q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
del qkv
q, k = self.norm(q, k, v)
attn = attention(q, k, v, pe=pe, mask=attn_mask)
del q, k, v
# Combine and project
output = self.linear2(torch.cat((attn, mlp), dim=-1))
del attn, mlp
result = x + mod.gate * output
# Handle fp16 numerical issues (matches ComfyUI exactly)
if result.dtype == torch.float16:
result = torch.nan_to_num(result, nan=0.0, posinf=65504, neginf=-65504)
return result
class LastLayer(nn.Module):
"""Final layer for unpatchifying and producing output."""
def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None, ops_bias: bool = True):
super().__init__()
if operations is None:
operations = get_ops()
self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.linear = operations.Linear(
hidden_size, patch_size * patch_size * out_channels, bias=ops_bias, dtype=dtype, device=device
)
self.adaLN_modulation = nn.Sequential(
nn.SiLU(),
operations.Linear(hidden_size, 2 * hidden_size, bias=ops_bias, dtype=dtype, device=device),
)
def forward(self, x: torch.Tensor, vec: torch.Tensor) -> torch.Tensor:
shift, scale = self.adaLN_modulation(vec).chunk(2, dim=-1)
x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
x = self.linear(x)
return x