"""
Xoron Model for HuggingFace Transformers - Self-Contained Implementation.
AUTO-GENERATED FILE - Do not edit directly!
This module provides a complete, self-contained HuggingFace-compatible model class
for the Xoron multimodal model. All components are embedded directly in this file
to enable loading via AutoModel with trust_remote_code=True WITHOUT requiring
the full Xoron-Dev package to be installed.
Usage:
from transformers import AutoModel, AutoConfig
config = AutoConfig.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
model = AutoModel.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
"""
import os
import math
import json
import logging
from dataclasses import dataclass, field
from typing import Optional, Dict, List, Union, Tuple, Any
import torch
import torch.nn as nn
import torch.nn.functional as F
try:
from safetensors.torch import save_file, load_file
except ImportError:
save_file, load_file = None, None
from transformers import PreTrainedModel, LlamaConfig, LlamaModel, LlamaForCausalLM
from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPast
try:
from transformers.models.llama.modeling_llama import (
LlamaAttention, LlamaDecoderLayer, LlamaRMSNorm, LlamaMLP,
LlamaRotaryEmbedding, apply_rotary_pos_emb, repeat_kv
)
except ImportError:
LlamaAttention = LlamaDecoderLayer = LlamaRMSNorm = LlamaMLP = None
LlamaRotaryEmbedding = apply_rotary_pos_emb = repeat_kv = None
try:
from .configuration_xoron import XoronConfig
except ImportError:
from configuration_xoron import XoronConfig
logger = logging.getLogger(__name__)
# ==============================================================================
# MODELS.COMPONENTS.LORA
# ==============================================================================
class LoRALinear (nn .Module ):
"""
SOTA LoRA layer with multiple variants.
Supports:
- Standard LoRA
- DoRA (Weight-Decomposed LoRA)
- rsLoRA (rank-stabilized scaling)
MEMORY OPTIMIZATION:
- Does NOT clone base weights - shares them with original module
- Only LoRA params (A, B, magnitude) consume additional memory
- Base weights are frozen and can be kept in lower precision
"""
def __init__ (
self ,
in_features :int ,
out_features :int ,
r :int =8 ,
lora_alpha :int =16 ,
lora_dropout :float =0.05 ,
merge_weights :bool =False ,
use_dora :bool =False ,
use_rslora :bool =True ,
base_layer :nn .Linear =None ,
):
super ().__init__ ()
self .r =r
self .lora_alpha =lora_alpha
self .merge_weights =merge_weights
self .merged =False
self .use_dora =use_dora
self .use_rslora =use_rslora
self .in_features =in_features
self .out_features =out_features
if base_layer is not None :
self .linear =base_layer
else :
self .linear =nn .Linear (in_features ,out_features ,bias =False )
if r >0 :
self .lora_A =nn .Parameter (torch .zeros (r ,in_features ))
self .lora_B =nn .Parameter (torch .zeros (out_features ,r ))
if use_rslora :
self .scaling =lora_alpha /math .sqrt (r )
else :
self .scaling =lora_alpha /r
self .lora_dropout =nn .Dropout (p =lora_dropout )if lora_dropout >0 else nn .Identity ()
nn .init .kaiming_uniform_ (self .lora_A ,a =math .sqrt (5 ))
nn .init .zeros_ (self .lora_B )
if use_dora :
self .magnitude =nn .Parameter (torch .ones (out_features ))
self .linear .weight .requires_grad =False
if hasattr (self .linear ,'bias')and self .linear .bias is not None :
self .linear .bias .requires_grad =False
def forward(self, x: torch.Tensor) -> torch.Tensor:
    if self.r > 0 and not self.merged:
        if self.use_dora:
            # DoRA: re-compose the adapted weight, normalize its direction per
            # output row, then rescale by the learned magnitude vector.
            weight = self.linear.weight + (self.lora_B @ self.lora_A) * self.scaling
            weight_norm = weight.norm(dim=1, keepdim=True)
            weight_normalized = weight / (weight_norm + 1e-6)
            result = F.linear(x, weight_normalized * self.magnitude.unsqueeze(1))
        else:
            # Standard/rsLoRA: frozen base output plus the scaled low-rank update.
            lora_out = self.lora_dropout(x) @ self.lora_A.T @ self.lora_B.T * self.scaling
            result = self.linear(x) + lora_out
    else:
        result = self.linear(x)
    return result
def merge_lora_weights (self ):
"""Merge LoRA weights into the main weights for inference."""
if self .r >0 and not self .merged :
delta =(self .lora_B @self .lora_A )*self .scaling
if self .use_dora :
weight =self .linear .weight +delta
weight_norm =weight .norm (dim =1 ,keepdim =True )
self .linear .weight .data =(weight /(weight_norm +1e-6 ))*self .magnitude .unsqueeze (1 )
else :
self .linear .weight .data +=delta
self .merged =True
def unmerge_lora_weights (self ):
"""Unmerge LoRA weights for continued training."""
if self .r >0 and self .merged :
self .linear .weight .data -=(self .lora_B @self .lora_A )*self .scaling
self .merged =False
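# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the generated model): wrapping a plain
# nn.Linear so the frozen base weight is shared and only the low-rank A/B
# factors are trained. All sizes below are placeholder assumptions.
def _example_lora_linear():
    base = nn.Linear(512, 512, bias=False)                 # pretrained projection, stays frozen
    lora = LoRALinear(512, 512, r=8, lora_alpha=16, use_rslora=True, base_layer=base)
    x = torch.randn(2, 16, 512)
    y = lora(x)                                            # base output + scaled (x @ A^T @ B^T)
    lora.merge_lora_weights()                              # fold the delta into base.weight
    y_merged = lora(x)                                     # now a single dense matmul
    return y, y_merged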
class LoRAConfig :
"""
Configuration for SOTA LoRA adaptation.
Supports multiple LoRA variants and configurations.
"""
def __init__ (
self ,
r :int =8 ,
lora_alpha :int =16 ,
lora_dropout :float =0.05 ,
target_modules :Optional [List [str ]]=None ,
enable_lora :bool =True ,
use_dora :bool =False ,
use_rslora :bool =True ,
lora_plus_lr_ratio :float =16.0 ,
):
self .r =r
self .lora_alpha =lora_alpha
self .lora_dropout =lora_dropout
self .target_modules =target_modules or [
'q_proj','k_proj','v_proj','o_proj',
'gate_proj','up_proj','down_proj',
]
self .enable_lora =enable_lora
self .use_dora =use_dora
self .use_rslora =use_rslora
self .lora_plus_lr_ratio =lora_plus_lr_ratio
def apply_lora_to_model (model :nn .Module ,lora_config :LoRAConfig )->nn .Module :
"""
Apply LoRA to specified modules in a model.
Returns the model with LoRA layers applied.
MEMORY OPTIMIZATION:
- Passes the original nn.Linear layer directly to LoRALinear
- This SHARES weights instead of cloning them (saves ~50% memory for target modules)
- Only LoRA parameters (A, B, magnitude) are newly allocated
For a 16GB model with 30% of weights in target modules:
- Old behavior: Clone ~5GB = 21GB total
- New behavior: Share weights = 16GB + ~50MB LoRA params
"""
if not lora_config .enable_lora :
return model
lora_layers_added =0
modules_to_replace =[]
total_base_params =0
for name ,module in model .named_modules ():
if not isinstance (module ,nn .Linear ):
continue
module_name =name .split ('.')[-1 ]
if module_name in lora_config .target_modules :
modules_to_replace .append ((name ,module ))
total_base_params +=module .weight .numel ()
for name ,module in modules_to_replace :
parts =name .split ('.')
attr_name =parts [-1 ]
parent_name ='.'.join (parts [:-1 ])
if parent_name :
parent =model .get_submodule (parent_name )
else :
parent =model
lora_layer =LoRALinear (
in_features =module .in_features ,
out_features =module .out_features ,
r =lora_config .r ,
lora_alpha =lora_config .lora_alpha ,
lora_dropout =lora_config .lora_dropout ,
use_dora =lora_config .use_dora ,
use_rslora =lora_config .use_rslora ,
base_layer =module ,
)
setattr (parent ,attr_name ,lora_layer )
lora_layers_added +=1
lora_params =lora_layers_added *(lora_config .r *(modules_to_replace [0 ][1 ].in_features +modules_to_replace [0 ][1 ].out_features ))if modules_to_replace else 0
base_mem_saved_mb =(total_base_params *2 )/(1024 *1024 )
lora_mem_added_mb =(lora_params *4 )/(1024 *1024 )
variant ="DoRA"if lora_config .use_dora else ("rsLoRA"if lora_config .use_rslora else "LoRA")
print (f"✅ {variant } applied to {lora_layers_added } layers (r={lora_config .r }, alpha={lora_config .lora_alpha })")
print (f" 💾 Memory optimization: {base_mem_saved_mb :.1f}MB base weights SHARED (not cloned)")
print (f" 📊 New LoRA params: ~{lora_mem_added_mb :.1f}MB (trainable)")
return model
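# ---------------------------------------------------------------------------
# Illustrative sketch: applying LoRA to a toy module whose submodule names match
# the default target list. The real caller passes the Llama backbone here; the
# toy module and sizes are placeholder assumptions for a quick shape check.
def _example_apply_lora():
    class _ToyAttention(nn.Module):
        def __init__(self):
            super().__init__()
            self.q_proj = nn.Linear(256, 256, bias=False)
            self.v_proj = nn.Linear(256, 256, bias=False)
        def forward(self, x):
            return self.v_proj(self.q_proj(x))
    toy = _ToyAttention()
    cfg = LoRAConfig(r=4, lora_alpha=8, target_modules=['q_proj', 'v_proj'])
    toy = apply_lora_to_model(toy, cfg)                    # q_proj/v_proj are now LoRALinear
    lora_count, total_count, pct = count_lora_parameters(toy)
    return toy(torch.randn(1, 10, 256)).shape, pct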
def get_lora_parameters (model :nn .Module )->List [nn .Parameter ]:
"""
Get only the LoRA parameters from a model.
NOTE: This does NOT change requires_grad on any parameters!
It simply returns the LoRA params (lora_A, lora_B, magnitude).
Use this when you want to get LoRA params for separate optimizer groups
or for LoRA-only training mode.
"""
lora_params =[]
for name ,param in model .named_parameters ():
if 'lora_A'in name or 'lora_B'in name or 'magnitude'in name :
lora_params .append (param )
return lora_params
def enable_lora_training (model :nn .Module )->List [nn .Parameter ]:
"""
Enable training for LoRA parameters (ensure requires_grad=True).
Returns list of LoRA parameters.
"""
lora_params =[]
for name ,param in model .named_parameters ():
if 'lora_A'in name or 'lora_B'in name or 'magnitude'in name :
param .requires_grad =True
lora_params .append (param )
return lora_params
def freeze_non_lora_params (model :nn .Module )->int :
"""
Freeze all non-LoRA parameters and clear their gradients.
USE THIS ONLY FOR LORA-ONLY TRAINING MODE (train_lora_only=True).
For normal training with parallel fine-tuning (LoRA + full weights on
active components), use the model's freeze_components() method instead,
which respects the training mode flags (--text, --video, --image, --voice).
Returns:
Number of frozen parameters
"""
frozen_params =0
freed_memory =0
for name ,param in model .named_parameters ():
is_lora ='lora_A'in name or 'lora_B'in name or 'magnitude'in name
if not is_lora :
param .requires_grad =False
frozen_params +=param .numel ()
if param .grad is not None :
freed_memory +=param .grad .numel ()*param .grad .element_size ()
param .grad =None
print (f" ❄️ Frozen {frozen_params :,} non-LoRA parameters")
if freed_memory >0 :
print (f" 🧹 Freed {freed_memory /(1024 **2 ):.1f}MB of gradient memory")
return frozen_params
def get_lora_plus_param_groups (
model :nn .Module ,
base_lr :float ,
lr_ratio :float =16.0
)->List [Dict ]:
"""
Get parameter groups for LoRA+ training.
LoRA+ uses different learning rates for A and B matrices:
- B matrix: base_lr * lr_ratio (learns faster)
- A matrix: base_lr
This improves convergence and final performance.
"""
lora_a_params =[]
lora_b_params =[]
magnitude_params =[]
other_params =[]
for name ,param in model .named_parameters ():
if not param .requires_grad :
continue
if 'lora_A'in name :
lora_a_params .append (param )
elif 'lora_B'in name :
lora_b_params .append (param )
elif 'magnitude'in name :
magnitude_params .append (param )
else :
other_params .append (param )
param_groups =[]
if lora_a_params :
param_groups .append ({'params':lora_a_params ,'lr':base_lr ,'name':'lora_A'})
if lora_b_params :
param_groups .append ({'params':lora_b_params ,'lr':base_lr *lr_ratio ,'name':'lora_B'})
if magnitude_params :
param_groups .append ({'params':magnitude_params ,'lr':base_lr ,'name':'magnitude'})
if other_params :
param_groups .append ({'params':other_params ,'lr':base_lr ,'name':'other'})
return param_groups
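# ---------------------------------------------------------------------------
# Illustrative sketch: building a LoRA+ optimizer where the B matrices train at
# a 16x higher learning rate than the A matrices, as described above. The model
# and base learning rate are placeholders; each group carries its own 'lr'.
def _example_lora_plus_optimizer(model: nn.Module, base_lr: float = 1e-4):
    enable_lora_training(model)                            # ensure lora_A/lora_B/magnitude require grad
    groups = get_lora_plus_param_groups(model, base_lr=base_lr, lr_ratio=16.0)
    return torch.optim.AdamW(groups, weight_decay=0.0)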
def get_trainable_parameters (model :nn .Module ,train_lora_only :bool =False )->List [nn .Parameter ]:
"""Get trainable parameters, optionally only LoRA params."""
if train_lora_only :
return get_lora_parameters (model )
else :
return [p for p in model .parameters ()if p .requires_grad ]
def count_lora_parameters (model :nn .Module )->Tuple [int ,int ,float ]:
"""
Count LoRA parameters vs total parameters.
Returns:
(lora_params, total_params, percentage)
"""
lora_params =0
total_params =0
for name ,param in model .named_parameters ():
total_params +=param .numel ()
if 'lora_A'in name or 'lora_B'in name or 'magnitude'in name :
lora_params +=param .numel ()
percentage =100.0 *lora_params /total_params if total_params >0 else 0.0
return lora_params ,total_params ,percentage
# ==============================================================================
# MODELS.COMPONENTS.ATTENTION
# ==============================================================================
logger =logging .getLogger (__name__ )
def flash_attention_available ()->bool :
"""Check if Flash Attention (via SDPA) is available."""
try :
from torch .nn .functional import scaled_dot_product_attention
return True
except ImportError :
return False
def compute_qk_scale (head_dim :int )->float :
"""Compute the Q/K pre-scaling factor for FP16 stability.
By scaling both Q and K by head_dim^-0.25, the product Q@K^T
is effectively scaled by head_dim^-0.5 (the standard attention scaling).
This prevents overflow in FP16 when Q and K have large values.
"""
return head_dim **-0.25
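# ---------------------------------------------------------------------------
# Tiny numeric check of the pre-scaling trick above: scaling Q and K by
# head_dim ** -0.25 each reproduces the standard 1/sqrt(head_dim) attention
# scaling while keeping the intermediate Q/K values small enough for FP16.
def _example_qk_scale_check(head_dim: int = 64) -> bool:
    q = torch.randn(1, 1, 4, head_dim)
    k = torch.randn(1, 1, 4, head_dim)
    s = compute_qk_scale(head_dim)
    prescaled = (q * s) @ (k * s).transpose(-2, -1)
    standard = (q @ k.transpose(-2, -1)) / math.sqrt(head_dim)
    return torch.allclose(prescaled, standard, atol=1e-5)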
class AttentionKVCache :
"""Pre-allocated KV Cache — static buffer with index-based filling.
Eliminates VRAM fragmentation from torch.cat during autoregressive generation.
Buffer is allocated once at first use and reused via slice assignment.
"""
__slots__ =('key_cache','value_cache','seen_tokens','_max_len')
def __init__ (self ,max_seq_len :int =131072 ):
self .key_cache :torch .Tensor =None
self .value_cache :torch .Tensor =None
self .seen_tokens :int =0
self ._max_len =max_seq_len
def _allocate (self ,batch :int ,heads :int ,head_dim :int ,device :torch .device ,dtype :torch .dtype ):
"""Allocate static buffer on first use."""
self .key_cache =torch .zeros (batch ,heads ,self ._max_len ,head_dim ,device =device ,dtype =dtype )
self .value_cache =torch .zeros (batch ,heads ,self ._max_len ,head_dim ,device =device ,dtype =dtype )
def update (
self ,
key_states :torch .Tensor ,
value_states :torch .Tensor ,
)->Tuple [torch .Tensor ,torch .Tensor ]:
"""
Update cache with new key/value states using index-based filling.
Args:
key_states: New key states [batch, num_heads, seq_len, head_dim]
value_states: New value states [batch, num_heads, seq_len, head_dim]
Returns:
Updated key and value states including cache (views, no copy)
"""
batch ,heads ,new_len ,head_dim =key_states .shape
if self .key_cache is None :
self ._allocate (batch ,heads ,head_dim ,key_states .device ,key_states .dtype )
self .seen_tokens =0
if self .seen_tokens +new_len >self .key_cache .shape [2 ]:
new_max =max (self .key_cache .shape [2 ]*2 ,self .seen_tokens +new_len )
new_key =torch .zeros (batch ,heads ,new_max ,head_dim ,device =key_states .device ,dtype =key_states .dtype )
new_val =torch .zeros (batch ,heads ,new_max ,head_dim ,device =key_states .device ,dtype =key_states .dtype )
new_key [:,:,:self .seen_tokens ]=self .key_cache [:,:,:self .seen_tokens ]
new_val [:,:,:self .seen_tokens ]=self .value_cache [:,:,:self .seen_tokens ]
self .key_cache =new_key
self .value_cache =new_val
self .key_cache [:,:,self .seen_tokens :self .seen_tokens +new_len ]=key_states
self .value_cache [:,:,self .seen_tokens :self .seen_tokens +new_len ]=value_states
self .seen_tokens +=new_len
return self .key_cache [:,:,:self .seen_tokens ],self .value_cache [:,:,:self .seen_tokens ]
def get_seq_length (self )->int :
"""Get current sequence length in cache."""
return self .seen_tokens
def reset (self ):
"""Reset cache position without deallocating the buffer."""
self .seen_tokens =0
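# ---------------------------------------------------------------------------
# Illustrative decode-loop sketch for the pre-allocated cache above: a 6-token
# prefill followed by two single-token steps, all written into the same static
# buffer (no torch.cat). Head counts and dims are placeholder assumptions.
def _example_kv_cache():
    cache = AttentionKVCache(max_seq_len=64)
    keys, values = cache.update(torch.randn(1, 8, 6, 32), torch.randn(1, 8, 6, 32))
    for _ in range(2):                                     # autoregressive steps: one token each
        keys, values = cache.update(torch.randn(1, 8, 1, 32), torch.randn(1, 8, 1, 32))
    return cache.get_seq_length(), keys.shape              # 8 tokens seen, keys [1, 8, 8, 32]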
class FlashAttention (nn .Module ):
"""
SOTA Flash Attention with KV cache support and FP16-safe Q/K pre-scaling.
Uses PyTorch's scaled_dot_product_attention when available,
with fallback to standard attention. Supports:
- KV caching for efficient generation
- Causal masking
- Attention dropout
- Pre-scaled Q/K for FP16 stability
"""
def __init__ (
self ,
dropout :float =0.0 ,
causal :bool =False ,
head_dim :int =None ,
):
super ().__init__ ()
self .dropout =dropout
self .causal =causal
self ._flash_available =flash_attention_available ()
self ._head_dim =head_dim
self ._qk_scale =compute_qk_scale (head_dim )if head_dim else None
def forward (
self ,
query :torch .Tensor ,
key :torch .Tensor ,
value :torch .Tensor ,
attn_mask :torch .Tensor =None ,
is_causal :bool =None ,
past_key_value :Tuple [torch .Tensor ,torch .Tensor ]=None ,
use_cache :bool =False ,
output_attentions :bool =False ,
)->Tuple [torch .Tensor ,Tuple [torch .Tensor ,torch .Tensor ],torch .Tensor ]:
"""
Forward pass with KV cache support.
Args:
query: Query tensor [batch, num_heads, seq_len, head_dim]
key: Key tensor [batch, num_heads, seq_len, head_dim]
value: Value tensor [batch, num_heads, seq_len, head_dim]
attn_mask: Optional attention mask
is_causal: Override causal setting
past_key_value: Optional tuple of (past_key, past_value) for KV cache
use_cache: Whether to return updated KV cache
output_attentions: Whether to return attention weights
Returns:
Tuple of (output, present_key_value, attention_weights)
"""
causal =is_causal if is_causal is not None else self .causal
batch_size ,num_heads ,seq_len ,head_dim =query .shape
qk_scale =self ._qk_scale if self ._qk_scale else compute_qk_scale (head_dim )
if past_key_value is not None :
past_key ,past_value =past_key_value
key =torch .cat ([past_key ,key ],dim =2 )
value =torch .cat ([past_value ,value ],dim =2 )
present_key_value =(key ,value )if use_cache else None
kv_seq_len =key .shape [2 ]
attn_weights =None
if self ._flash_available and not output_attentions :
query_scaled =query *qk_scale
key_scaled =key *qk_scale
dropout_p =self .dropout if self .training else 0.0
use_causal =causal and attn_mask is None and seq_len >1 and seq_len ==kv_seq_len
output =F .scaled_dot_product_attention (
query_scaled ,key_scaled ,value ,
attn_mask =attn_mask ,
dropout_p =dropout_p ,
is_causal =use_causal ,
scale =1.0 ,
)
else :
scale =1.0 /math .sqrt (head_dim )
attn_weights =torch .matmul (query ,key .transpose (-2 ,-1 ))*scale
if causal and attn_mask is None and seq_len >1 :
causal_mask =torch .triu (
torch .full ((seq_len ,kv_seq_len ),float ('-inf'),device =query .device ,dtype =query .dtype ),
diagonal =kv_seq_len -seq_len +1
)
attn_weights =attn_weights +causal_mask .unsqueeze (0 ).unsqueeze (0 )
if attn_mask is not None :
attn_weights =attn_weights +attn_mask
attn_weights =F .softmax (attn_weights ,dim =-1 ,dtype =query .dtype )
if self .training and self .dropout >0 :
attn_weights =F .dropout (attn_weights ,p =self .dropout )
output =torch .matmul (attn_weights ,value )
return output ,present_key_value ,attn_weights
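# ---------------------------------------------------------------------------
# Illustrative sketch of the wrapper above: causal self-attention over a short
# sequence, then one decoding step that reuses the returned KV pair. Shapes are
# placeholder assumptions ([batch, heads, seq, head_dim]).
def _example_flash_attention():
    attn = FlashAttention(dropout=0.0, causal=True, head_dim=32)
    q, k, v = (torch.randn(1, 8, 6, 32) for _ in range(3))
    out, present_kv, _ = attn(q, k, v, use_cache=True)
    q1, k1, v1 = (torch.randn(1, 8, 1, 32) for _ in range(3))
    out_step, present_kv, _ = attn(q1, k1, v1, past_key_value=present_kv, use_cache=True)
    return out.shape, out_step.shape                       # [1, 8, 6, 32], [1, 8, 1, 32]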
class MultimodalCrossAttention (nn .Module ):
"""
SOTA Cross-attention layer for multimodal fusion with KV cache support.
Allows text to attend to image/video/audio features with:
- KV caching for efficient generation
- Gated residual connection for stable training
- Flash Attention support with pre-scaled Q/K for FP16 stability
- Optional attention weight output
"""
def __init__ (
self ,
hidden_size :int ,
num_heads :int =8 ,
dropout :float =0.1 ,
use_flash_attention :bool =True ,
gate_init :float =0.0 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_heads =num_heads
self .head_dim =hidden_size //num_heads
self .use_flash_attention =use_flash_attention and flash_attention_available ()
self .dropout_p =dropout
self .qk_scale =compute_qk_scale (self .head_dim )
self .q_proj =nn .Linear (hidden_size ,hidden_size ,bias =False )
self .k_proj =nn .Linear (hidden_size ,hidden_size ,bias =False )
self .v_proj =nn .Linear (hidden_size ,hidden_size ,bias =False )
self .o_proj =nn .Linear (hidden_size ,hidden_size ,bias =False )
self .dropout =nn .Dropout (dropout )
self .layer_norm =nn .LayerNorm (hidden_size )
self .gate =nn .Parameter (torch .tensor (gate_init ))
def forward (
self ,
text_hidden :torch .Tensor ,
modality_hidden :torch .Tensor ,
modality_mask :torch .Tensor =None ,
past_key_value :Tuple [torch .Tensor ,torch .Tensor ]=None ,
use_cache :bool =False ,
output_attentions :bool =False ,
)->Tuple [torch .Tensor ,Tuple [torch .Tensor ,torch .Tensor ],torch .Tensor ]:
"""
Cross-attention: text attends to modality features with KV cache support.
Args:
text_hidden: Text hidden states [batch, text_len, hidden_size]
modality_hidden: Modality features [batch, modality_len, hidden_size]
modality_mask: Optional attention mask for modality
past_key_value: Optional cached (key, value) for this layer
use_cache: Whether to return updated KV cache
output_attentions: Whether to return attention weights
Returns:
Tuple of (output, present_key_value, attention_weights)
"""
batch_size ,text_len ,_ =text_hidden .shape
query =self .q_proj (text_hidden )
query =query .view (batch_size ,text_len ,self .num_heads ,self .head_dim ).transpose (1 ,2 )
if past_key_value is not None :
key ,value =past_key_value
else :
modality_len =modality_hidden .shape [1 ]
key =self .k_proj (modality_hidden )
value =self .v_proj (modality_hidden )
key =key .view (batch_size ,modality_len ,self .num_heads ,self .head_dim ).transpose (1 ,2 )
value =value .view (batch_size ,modality_len ,self .num_heads ,self .head_dim ).transpose (1 ,2 )
present_key_value =(key ,value )if use_cache else None
attn_weights =None
if self .use_flash_attention and not output_attentions :
query_scaled =query *self .qk_scale
key_scaled =key *self .qk_scale
dropout_p =self .dropout_p if self .training else 0.0
attn_output =F .scaled_dot_product_attention (
query_scaled ,key_scaled ,value ,
attn_mask =modality_mask ,
dropout_p =dropout_p ,
is_causal =False ,
scale =1.0 ,
)
else :
scale =1.0 /math .sqrt (self .head_dim )
attn_weights =torch .matmul (query ,key .transpose (-2 ,-1 ))*scale
if modality_mask is not None :
attn_weights =attn_weights +modality_mask
attn_weights =F .softmax (attn_weights ,dim =-1 ,dtype =text_hidden .dtype )
if self .training and self .dropout_p >0 :
attn_weights =F .dropout (attn_weights ,p =self .dropout_p )
attn_output =torch .matmul (attn_weights ,value )
attn_output =attn_output .transpose (1 ,2 ).contiguous ().view (batch_size ,text_len ,self .hidden_size )
attn_output =self .o_proj (attn_output )
gate =torch .sigmoid (self .gate )
output =text_hidden +gate *self .dropout (attn_output )
output =self .layer_norm (output )
return output ,present_key_value ,attn_weights
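# ---------------------------------------------------------------------------
# Illustrative sketch: 12 text tokens attending to 64 image tokens through the
# gated cross-attention block above, then one generation step that reuses the
# cached image K/V. Hidden size and lengths are placeholder assumptions.
def _example_cross_attention():
    xattn = MultimodalCrossAttention(hidden_size=256, num_heads=8, dropout=0.0)
    text = torch.randn(2, 12, 256)
    image = torch.randn(2, 64, 256)
    fused, image_kv, _ = xattn(text, image, use_cache=True)
    next_token = torch.randn(2, 1, 256)
    fused_step, _, _ = xattn(next_token, image, past_key_value=image_kv)
    return fused.shape, fused_step.shape                   # [2, 12, 256], [2, 1, 256]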
@dataclass
class MultimodalFusionCache :
"""Cache for multimodal fusion layer KV states."""
image_kv :Tuple [torch .Tensor ,torch .Tensor ]=None
video_kv :Tuple [torch .Tensor ,torch .Tensor ]=None
audio_kv :Tuple [torch .Tensor ,torch .Tensor ]=None
class MultimodalFusionLayer (nn .Module ):
"""
SOTA Multimodal fusion layer with cross-attention for all modalities and KV cache support.
Features:
- Separate cross-attention for each modality (image, video, audio)
- KV caching for efficient generation
- Gated fusion MLP
- Flash Attention support
"""
def __init__ (
self ,
hidden_size :int ,
num_heads :int =8 ,
dropout :float =0.1 ,
use_flash_attention :bool =True ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .image_cross_attn =MultimodalCrossAttention (
hidden_size ,num_heads ,dropout ,use_flash_attention
)
self .video_cross_attn =MultimodalCrossAttention (
hidden_size ,num_heads ,dropout ,use_flash_attention
)
self .audio_cross_attn =MultimodalCrossAttention (
hidden_size ,num_heads ,dropout ,use_flash_attention
)
self .fusion_mlp =nn .Sequential (
nn .Linear (hidden_size ,hidden_size *4 ),
nn .GELU (),
nn .Dropout (dropout ),
nn .Linear (hidden_size *4 ,hidden_size ),
nn .Dropout (dropout ),
)
self .fusion_norm =nn .LayerNorm (hidden_size )
def forward (
self ,
text_hidden :torch .Tensor ,
image_hidden :torch .Tensor =None ,
video_hidden :torch .Tensor =None ,
audio_hidden :torch .Tensor =None ,
image_mask :torch .Tensor =None ,
video_mask :torch .Tensor =None ,
audio_mask :torch .Tensor =None ,
past_key_values :MultimodalFusionCache =None ,
use_cache :bool =False ,
)->Tuple [torch .Tensor ,MultimodalFusionCache ]:
"""
Fuse text with available modalities via cross-attention with KV cache support.
Args:
text_hidden: Text hidden states [batch, text_len, hidden_size]
image_hidden: Image features [batch, image_len, hidden_size]
video_hidden: Video features [batch, video_len, hidden_size]
audio_hidden: Audio features [batch, audio_len, hidden_size]
image_mask: Attention mask for image
video_mask: Attention mask for video
audio_mask: Attention mask for audio
past_key_values: Cached KV states from previous forward pass
use_cache: Whether to return updated KV cache
Returns:
Tuple of (output, present_key_values)
"""
present_key_values =MultimodalFusionCache ()if use_cache else None
past_image_kv =past_key_values .image_kv if past_key_values else None
past_video_kv =past_key_values .video_kv if past_key_values else None
past_audio_kv =past_key_values .audio_kv if past_key_values else None
if self ._has_content (image_hidden )or past_image_kv is not None :
try :
text_hidden ,image_kv ,_ =self .image_cross_attn (
text_hidden ,
image_hidden if image_hidden is not None else torch .zeros (text_hidden .shape [0 ],1 ,self .hidden_size ,device =text_hidden .device ),
image_mask ,
past_key_value =past_image_kv ,
use_cache =use_cache ,
)
if use_cache :
present_key_values .image_kv =image_kv
except Exception as e :
logger .debug (f"Image cross-attention skipped: {e }")
if self ._has_content (video_hidden )or past_video_kv is not None :
try :
text_hidden ,video_kv ,_ =self .video_cross_attn (
text_hidden ,
video_hidden if video_hidden is not None else torch .zeros (text_hidden .shape [0 ],1 ,self .hidden_size ,device =text_hidden .device ),
video_mask ,
past_key_value =past_video_kv ,
use_cache =use_cache ,
)
if use_cache :
present_key_values .video_kv =video_kv
except Exception as e :
logger .debug (f"Video cross-attention skipped: {e }")
if self ._has_content (audio_hidden )or past_audio_kv is not None :
try :
text_hidden ,audio_kv ,_ =self .audio_cross_attn (
text_hidden ,
audio_hidden if audio_hidden is not None else torch .zeros (text_hidden .shape [0 ],1 ,self .hidden_size ,device =text_hidden .device ),
audio_mask ,
past_key_value =past_audio_kv ,
use_cache =use_cache ,
)
if use_cache :
present_key_values .audio_kv =audio_kv
except Exception as e :
logger .debug (f"Audio cross-attention skipped: {e }")
residual =text_hidden
text_hidden =self .fusion_mlp (text_hidden )
text_hidden =self .fusion_norm (residual +text_hidden )
return text_hidden ,present_key_values
@staticmethod
def _has_content (tensor :torch .Tensor )->bool :
"""Check if tensor has meaningful content."""
if tensor is None :
return False
if not isinstance (tensor ,torch .Tensor ):
return False
try :
if tensor .numel ()==0 :
return False
return bool (tensor .any ())
except Exception :
return False
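# ---------------------------------------------------------------------------
# Illustrative sketch of the fusion layer above: text fused with image and audio
# features (video omitted), returning the per-modality KV cache that a decoding
# loop would pass back in. Sizes are placeholder assumptions.
def _example_fusion_layer():
    fusion = MultimodalFusionLayer(hidden_size=256, num_heads=8, dropout=0.0)
    text = torch.randn(2, 12, 256)
    image = torch.randn(2, 64, 256)
    audio = torch.randn(2, 32, 256)
    fused, kv_cache = fusion(text, image_hidden=image, audio_hidden=audio, use_cache=True)
    return fused.shape, kv_cache.image_kv[0].shape         # [2, 12, 256], [2, 8, 64, 32]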
# ==============================================================================
# MODELS.COMPONENTS.PROJECTORS
# ==============================================================================
def compute_2d_rope (height :int ,width :int ,dim :int ,device :torch .device ,dtype :torch .dtype ,base :float =10000.0 )->Tuple [torch .Tensor ,torch .Tensor ]:
"""
Compute 2D Rotary Position Embeddings for spatial awareness.
Args:
height: Image height in patches
width: Image width in patches
dim: Embedding dimension (must be divisible by 4)
device: Target device
dtype: Target dtype
base: RoPE base frequency
Returns:
cos, sin: [height*width, dim] position embeddings
"""
assert dim %4 ==0 ,"dim must be divisible by 4 for 2D RoPE"
half_dim =dim //2
quarter_dim =dim //4
inv_freq =1.0 /(base **(torch .arange (0 ,quarter_dim ,device =device ,dtype =torch .float32 )/quarter_dim ))
y_pos =torch .arange (height ,device =device ,dtype =torch .float32 )
x_pos =torch .arange (width ,device =device ,dtype =torch .float32 )
y_emb =torch .outer (y_pos ,inv_freq )
x_emb =torch .outer (x_pos ,inv_freq )
y_emb =y_emb .unsqueeze (1 ).expand (-1 ,width ,-1 )
x_emb =x_emb .unsqueeze (0 ).expand (height ,-1 ,-1 )
emb =torch .cat ([y_emb ,y_emb ,x_emb ,x_emb ],dim =-1 )
emb =emb .reshape (height *width ,dim )
return emb .cos ().to (dtype ),emb .sin ().to (dtype )
def compute_3d_rope (
depth :int ,height :int ,width :int ,dim :int ,
device :torch .device ,dtype :torch .dtype ,base :float =10000.0
)->Tuple [torch .Tensor ,torch .Tensor ]:
"""
Compute 3D Rotary Position Embeddings for video/temporal awareness.
Args:
depth: Temporal depth (number of frames)
height: Image height in patches
width: Image width in patches
dim: Embedding dimension (must be divisible by 6)
device: Target device
dtype: Target dtype
base: RoPE base frequency
Returns:
cos, sin: [depth*height*width, dim] position embeddings
"""
assert dim %6 ==0 ,"dim must be divisible by 6 for 3D RoPE"
sixth_dim =dim //6
inv_freq =1.0 /(base **(torch .arange (0 ,sixth_dim ,device =device ,dtype =torch .float32 )/sixth_dim ))
t_pos =torch .arange (depth ,device =device ,dtype =torch .float32 )
y_pos =torch .arange (height ,device =device ,dtype =torch .float32 )
x_pos =torch .arange (width ,device =device ,dtype =torch .float32 )
t_emb =torch .outer (t_pos ,inv_freq )
y_emb =torch .outer (y_pos ,inv_freq )
x_emb =torch .outer (x_pos ,inv_freq )
t_emb =t_emb .unsqueeze (1 ).unsqueeze (2 ).expand (-1 ,height ,width ,-1 )
y_emb =y_emb .unsqueeze (0 ).unsqueeze (2 ).expand (depth ,-1 ,width ,-1 )
x_emb =x_emb .unsqueeze (0 ).unsqueeze (1 ).expand (depth ,height ,-1 ,-1 )
emb =torch .cat ([t_emb ,t_emb ,y_emb ,y_emb ,x_emb ,x_emb ],dim =-1 )
emb =emb .reshape (depth *height *width ,dim )
return emb .cos ().to (dtype ),emb .sin ().to (dtype )
def apply_rope (x :torch .Tensor ,cos :torch .Tensor ,sin :torch .Tensor )->torch .Tensor :
"""Apply rotary position embeddings."""
x1 =x [...,:x .shape [-1 ]//2 ]
x2 =x [...,x .shape [-1 ]//2 :]
rotated =torch .cat ((-x2 ,x1 ),dim =-1 )
return x *cos +rotated *sin
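# ---------------------------------------------------------------------------
# Illustrative sketch: computing 2D rotary embeddings for an 8x8 patch grid and
# rotating a matching feature map. The feature dim (divisible by 4) and batch
# size are placeholder assumptions.
def _example_2d_rope():
    h, w, dim = 8, 8, 64
    feats = torch.randn(2, h * w, dim)
    cos, sin = compute_2d_rope(h, w, dim, feats.device, feats.dtype)
    rotated = apply_rope(feats, cos.unsqueeze(0), sin.unsqueeze(0))
    return rotated.shape                                   # [2, 64, 64]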
class ResidualBottleneckBlock (nn .Module ):
"""
Residual Bottleneck Block for locality-enhanced feature extraction.
Preserves small-scale features (OCR, fine audio events) during compression.
"""
def __init__ (self ,in_channels :int ,out_channels :int ,bottleneck_ratio :float =0.25 ):
super ().__init__ ()
bottleneck_channels =int (out_channels *bottleneck_ratio )
self .conv1 =nn .Conv2d (in_channels ,bottleneck_channels ,1 ,bias =False )
self .bn1 =nn .BatchNorm2d (bottleneck_channels )
self .conv2 =nn .Conv2d (bottleneck_channels ,bottleneck_channels ,3 ,padding =1 ,bias =False )
self .bn2 =nn .BatchNorm2d (bottleneck_channels )
self .conv3 =nn .Conv2d (bottleneck_channels ,out_channels ,1 ,bias =False )
self .bn3 =nn .BatchNorm2d (out_channels )
self .shortcut =nn .Identity ()if in_channels ==out_channels else nn .Sequential (
nn .Conv2d (in_channels ,out_channels ,1 ,bias =False ),
nn .BatchNorm2d (out_channels ),
)
self .relu =nn .ReLU (inplace =True )
def forward (self ,x :torch .Tensor )->torch .Tensor :
identity =self .shortcut (x )
out =self .relu (self .bn1 (self .conv1 (x )))
out =self .relu (self .bn2 (self .conv2 (out )))
out =self .bn3 (self .conv3 (out ))
out =out +identity
out =self .relu (out )
return out
class LocalityEnhancedResNetAbstractor (nn .Module ):
"""
Locality-Enhanced ResNet Abstractor.
Upgrades the C-Abstractor with residual bottleneck blocks to preserve
small-scale features (OCR/fine audio events) during compression.
"""
def __init__ (
self ,
input_dim :int ,
output_dim :int ,
num_tokens :int =64 ,
num_blocks :int =3 ,
use_2d_rope :bool =True ,
):
super ().__init__ ()
self .num_tokens =num_tokens
self .use_2d_rope =use_2d_rope
self .input_proj =nn .Linear (input_dim ,output_dim )
self .blocks =nn .ModuleList ([
ResidualBottleneckBlock (output_dim ,output_dim )
for _ in range (num_blocks )
])
self .queries =nn .Parameter (torch .randn (1 ,num_tokens ,output_dim )*0.02 )
self .cross_attn =nn .MultiheadAttention (
embed_dim =output_dim ,
num_heads =8 ,
batch_first =True ,
dropout =0.1 ,
)
self .ff =nn .Sequential (
nn .LayerNorm (output_dim ),
nn .Linear (output_dim ,output_dim *4 ),
nn .GELU (),
nn .Linear (output_dim *4 ,output_dim ),
)
self .norm =nn .LayerNorm (output_dim )
print (f" 🏗️ LocalityEnhancedResNetAbstractor: {input_dim } -> {output_dim }, {num_tokens } tokens")
def forward (self ,features :torch .Tensor ,spatial_size :Optional [Tuple [int ,int ]]=None )->torch .Tensor :
"""
Args:
features: [B, seq_len, input_dim] or [B, H, W, input_dim]
spatial_size: (H, W) if features are flattened
Returns:
abstracted: [B, num_tokens, output_dim]
"""
batch_size =features .shape [0 ]
x =self .input_proj (features )
if features .dim ()==3 :
seq_len =features .shape [1 ]
if spatial_size is None :
h =w =int (math .sqrt (seq_len ))
else :
h ,w =spatial_size
x =x .view (batch_size ,h ,w ,-1 )
else :
h ,w =features .shape [1 ],features .shape [2 ]
x =x .permute (0 ,3 ,1 ,2 )
for block in self .blocks :
x =block (x )
x =x .permute (0 ,2 ,3 ,1 )
x =x .reshape (batch_size ,h *w ,-1 )
if self .use_2d_rope :
cos ,sin =compute_2d_rope (h ,w ,x .shape [-1 ],x .device ,x .dtype )
x =apply_rope (x ,cos .unsqueeze (0 ),sin .unsqueeze (0 ))
queries =self .queries .expand (batch_size ,-1 ,-1 )
abstracted ,_ =self .cross_attn (queries ,x ,x )
abstracted =abstracted +self .ff (abstracted )
return self .norm (abstracted )
class MultiScaleFeatureFusion (nn .Module ):
"""
Multi-Scale Feature Fusion (MSFF).
Extracts and weights features from multiple encoder depths (early, mid, late)
to capture both low-level textures and high-level semantics.
"""
def __init__ (
self ,
feature_dims :List [int ],
output_dim :int ,
num_scales :int =3 ,
):
super ().__init__ ()
self .num_scales =num_scales
self .scale_projs =nn .ModuleList ([
nn .Linear (dim ,output_dim )for dim in feature_dims
])
self .scale_weights =nn .Parameter (torch .ones (num_scales )/num_scales )
self .fusion =nn .Sequential (
nn .Linear (output_dim ,output_dim *2 ),
nn .GELU (),
nn .Linear (output_dim *2 ,output_dim ),
)
self .norm =nn .LayerNorm (output_dim )
print (f" 🔀 MultiScaleFeatureFusion: {feature_dims } -> {output_dim }")
def forward (self ,multi_scale_features :List [torch .Tensor ])->torch .Tensor :
"""
Args:
multi_scale_features: List of [B, seq_len, dim] features from different depths
Returns:
fused: [B, seq_len, output_dim]
"""
assert len (multi_scale_features )==self .num_scales
projected =[]
for i ,(features ,proj )in enumerate (zip (multi_scale_features ,self .scale_projs )):
projected .append (proj (features ))
weights =F .softmax (self .scale_weights ,dim =0 )
fused =sum (w *p for w ,p in zip (weights ,projected ))
fused =fused +self .fusion (fused )
return self .norm (fused )
class MultiScaleDeformableAttention (nn .Module ):
"""
Multi-Scale Deformable Attention.
Replaces fixed-grid cross-attention in Perceiver Resamplers,
allowing the projector to "look" at non-uniform regions of interest.
"""
def __init__ (
self ,
dim :int ,
num_heads :int =8 ,
num_levels :int =4 ,
num_points :int =4 ,
dropout :float =0.1 ,
):
super ().__init__ ()
self .dim =dim
self .num_heads =num_heads
self .num_levels =num_levels
self .num_points =num_points
self .head_dim =dim //num_heads
self .sampling_offsets =nn .Linear (dim ,num_heads *num_levels *num_points *2 )
self .attention_weights =nn .Linear (dim ,num_heads *num_levels *num_points )
self .value_proj =nn .Linear (dim ,dim )
self .output_proj =nn .Linear (dim ,dim )
self .dropout =nn .Dropout (dropout )
self ._reset_parameters ()
print (f" 🎯 MultiScaleDeformableAttention: {dim }d, {num_heads }H, {num_levels }L, {num_points }P")
def _reset_parameters (self ):
nn .init .constant_ (self .sampling_offsets .weight ,0.0 )
nn .init .constant_ (self .sampling_offsets .bias ,0.0 )
nn .init .xavier_uniform_ (self .attention_weights .weight )
nn .init .constant_ (self .attention_weights .bias ,0.0 )
nn .init .xavier_uniform_ (self .value_proj .weight )
nn .init .xavier_uniform_ (self .output_proj .weight )
def forward (
self ,
query :torch .Tensor ,
reference_points :torch .Tensor ,
input_flatten :torch .Tensor ,
input_spatial_shapes :torch .Tensor ,
)->torch .Tensor :
"""
Args:
query: [B, num_queries, dim]
reference_points: [B, num_queries, num_levels, 2] normalized reference points
input_flatten: [B, sum(H*W), dim] flattened multi-scale features
input_spatial_shapes: [num_levels, 2] spatial shapes of each level
Returns:
output: [B, num_queries, dim]
"""
batch_size ,num_queries ,_ =query .shape
offsets =self .sampling_offsets (query )
offsets =offsets .view (batch_size ,num_queries ,self .num_heads ,self .num_levels ,self .num_points ,2 )
attn_weights =self .attention_weights (query )
attn_weights =attn_weights .view (batch_size ,num_queries ,self .num_heads ,self .num_levels *self .num_points )
attn_weights =F .softmax (attn_weights ,dim =-1 )
attn_weights =attn_weights .view (batch_size ,num_queries ,self .num_heads ,self .num_levels ,self .num_points )
sampling_locations =reference_points .unsqueeze (2 ).unsqueeze (4 )+offsets *0.1
sampling_locations =sampling_locations .clamp (0 ,1 )
value =self .value_proj (input_flatten )
value =value .view (batch_size ,-1 ,self .num_heads ,self .head_dim )
output =torch .zeros (batch_size ,num_queries ,self .num_heads ,self .head_dim ,device =query .device ,dtype =query .dtype )
start_idx =0
for level_idx in range (self .num_levels ):
h ,w =input_spatial_shapes [level_idx ]
end_idx =start_idx +h *w
level_value =value [:,start_idx :end_idx ]
level_value =level_value .view (batch_size ,h ,w ,self .num_heads ,self .head_dim )
level_locs =sampling_locations [:,:,:,level_idx ]
level_weights =attn_weights [:,:,:,level_idx ]
for point_idx in range (self .num_points ):
loc =level_locs [:,:,:,point_idx ]
weight =level_weights [:,:,:,point_idx :point_idx +1 ]
y_idx =(loc [...,0 ]*(h -1 )).long ().clamp (0 ,h -1 )
x_idx =(loc [...,1 ]*(w -1 )).long ().clamp (0 ,w -1 )
for b in range (batch_size ):
for q in range (num_queries ):
for head in range (self .num_heads ):
y ,x =y_idx [b ,q ,head ].item (),x_idx [b ,q ,head ].item ()
output [b ,q ,head ]+=weight [b ,q ,head ]*level_value [b ,y ,x ,head ]
start_idx =end_idx
output =output .view (batch_size ,num_queries ,self .dim )
output =self .output_proj (output )
output =self .dropout (output )
return output
class DynamicTokenRouter (nn .Module ):
"""
Dynamic Token Router.
Implements a sparse gating mechanism to drop redundant "background" tokens,
drastically reducing KV-cache pressure for Ring Attention.
"""
def __init__ (
self ,
dim :int ,
num_tokens :int ,
keep_ratio :float =0.5 ,
temperature :float =1.0 ,
):
super ().__init__ ()
self .dim =dim
self .num_tokens =num_tokens
self .keep_ratio =keep_ratio
self .temperature =temperature
self .scorer =nn .Sequential (
nn .Linear (dim ,dim //2 ),
nn .GELU (),
nn .Linear (dim //2 ,1 ),
)
self .threshold =nn .Parameter (torch .tensor (0.0 ))
print (f" 🚦 DynamicTokenRouter: keep_ratio={keep_ratio }")
def forward (self ,tokens :torch .Tensor ,return_mask :bool =False )->Tuple [torch .Tensor ,Optional [torch .Tensor ]]:
"""
Args:
tokens: [B, num_tokens, dim]
return_mask: Whether to return the selection mask
Returns:
selected_tokens: [B, num_kept, dim]
mask: [B, num_tokens] selection mask (if return_mask=True)
"""
batch_size ,num_tokens ,_ =tokens .shape
num_keep =max (1 ,int (num_tokens *self .keep_ratio ))
scores =self .scorer (tokens ).squeeze (-1 )
scores =scores /self .temperature
_ ,indices =torch .topk (scores ,num_keep ,dim =-1 )
indices =indices .sort (dim =-1 ).values
indices_expanded =indices .unsqueeze (-1 ).expand (-1 ,-1 ,self .dim )
selected_tokens =torch .gather (tokens ,1 ,indices_expanded )
if return_mask :
mask =torch .zeros (batch_size ,num_tokens ,device =tokens .device ,dtype =torch .bool )
mask .scatter_ (1 ,indices ,True )
return selected_tokens ,mask
return selected_tokens ,None
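# ---------------------------------------------------------------------------
# Illustrative sketch: keeping the highest-scoring half of 64 projector tokens
# with the router above, plus the boolean mask of surviving positions. Sizes
# are placeholder assumptions.
def _example_token_router():
    router = DynamicTokenRouter(dim=256, num_tokens=64, keep_ratio=0.5)
    tokens = torch.randn(2, 64, 256)
    kept, mask = router(tokens, return_mask=True)
    return kept.shape, mask.sum(dim=-1)                    # [2, 32, 256], 32 kept per sample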
class PerceiverAttention (nn .Module ):
"""
Perceiver-style cross-attention for resampling with 2D/3D RoPE support.
"""
def __init__ (
self ,
dim :int ,
num_heads :int =8 ,
dim_head :int =64 ,
dropout :float =0.0 ,
use_rope :bool =True ,
):
super ().__init__ ()
inner_dim =dim_head *num_heads
self .num_heads =num_heads
self .dim_head =dim_head
self .inner_dim =inner_dim
self .scale =dim_head **-0.5
self .use_rope =use_rope
self .norm_latents =nn .LayerNorm (dim )
self .norm_context =nn .LayerNorm (dim )
self .to_q =nn .Linear (dim ,inner_dim ,bias =False )
self .to_kv =nn .Linear (dim ,inner_dim *2 ,bias =False )
self .to_out =nn .Sequential (
nn .Linear (inner_dim ,dim ),
nn .Dropout (dropout )
)
def forward (
self ,
latents :torch .Tensor ,
context :torch .Tensor ,
context_rope :Optional [Tuple [torch .Tensor ,torch .Tensor ]]=None ,
)->torch .Tensor :
"""
latents: [B, num_latents, dim] - learnable queries
context: [B, seq_len, dim] - input features to attend to
context_rope: Optional (cos, sin) for context positions
"""
latents =self .norm_latents (latents )
context =self .norm_context (context )
b ,n ,_ =latents .shape
ctx_len =context .shape [1 ]
h =self .num_heads
d =self .dim_head
q = self.to_q(latents)
k, v = self.to_kv(context).chunk(2, dim=-1)
if self.use_rope and context_rope is not None:
    # Rotate the keys while their last dim still equals inner_dim, so the
    # [ctx_len, dim] cos/sin tables from the resampler broadcast correctly;
    # rotating after the head split would mismatch dim_head against dim.
    cos, sin = context_rope
    k = apply_rope(k, cos.unsqueeze(0), sin.unsqueeze(0))
q = q.reshape(b, n, h, d).transpose(1, 2)
k = k.reshape(b, ctx_len, h, d).transpose(1, 2)
v = v.reshape(b, ctx_len, h, d).transpose(1, 2)
qk_scale =d **-0.25
out =F .scaled_dot_product_attention (
q *qk_scale ,k *qk_scale ,v ,
is_causal =False ,scale =1.0 ,
)
out =out .transpose (1 ,2 ).reshape (b ,n ,self .inner_dim )
return self .to_out (out )
class PerceiverResampler (nn .Module ):
"""
Perceiver Resampler with 2D/3D RoPE and Dynamic Token Routing.
"""
def __init__ (
self ,
input_dim :int ,
output_dim :int ,
num_latents :int =64 ,
num_heads :int =8 ,
num_layers :int =2 ,
dropout :float =0.0 ,
use_rope :bool =True ,
use_dynamic_routing :bool =False ,
routing_keep_ratio :float =0.5 ,
):
super ().__init__ ()
self .num_latents =num_latents
self .use_rope =use_rope
self .use_dynamic_routing =use_dynamic_routing
self .input_proj =nn .Linear (input_dim ,output_dim )if input_dim !=output_dim else nn .Identity ()
self .latents =nn .Parameter (torch .randn (1 ,num_latents ,output_dim )*0.02 )
self .layers =nn .ModuleList ([
nn .ModuleList ([
PerceiverAttention (output_dim ,num_heads ,output_dim //num_heads ,dropout ,use_rope ),
nn .Sequential (
nn .LayerNorm (output_dim ),
nn .Linear (output_dim ,output_dim *4 ),
nn .GELU (),
nn .Dropout (dropout ),
nn .Linear (output_dim *4 ,output_dim ),
nn .Dropout (dropout ),
)
])
for _ in range (num_layers )
])
if use_dynamic_routing :
self .token_router =DynamicTokenRouter (output_dim ,num_latents ,routing_keep_ratio )
else :
self .token_router =None
self .norm_out =nn .LayerNorm (output_dim )
def forward (
self ,
x :torch .Tensor ,
spatial_size :Optional [Tuple [int ,int ]]=None ,
temporal_size :Optional [int ]=None ,
)->torch .Tensor :
"""
x: [B, seq_len, input_dim] - input features
spatial_size: (H, W) for 2D RoPE
temporal_size: T for 3D RoPE (video)
returns: [B, num_latents, output_dim] - compressed features
"""
batch_size =x .shape [0 ]
x =self .input_proj (x )
context_rope =None
if self .use_rope and spatial_size is not None :
h ,w =spatial_size
if temporal_size is not None :
cos ,sin =compute_3d_rope (temporal_size ,h ,w ,x .shape [-1 ],x .device ,x .dtype )
else :
cos ,sin =compute_2d_rope (h ,w ,x .shape [-1 ],x .device ,x .dtype )
context_rope =(cos ,sin )
latents =self .latents .expand (batch_size ,-1 ,-1 )
for attn ,ff in self .layers :
latents =latents +attn (latents ,x ,context_rope )
latents =latents +ff (latents )
latents =self .norm_out (latents )
if self .token_router is not None :
latents ,_ =self .token_router (latents )
return latents
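# ---------------------------------------------------------------------------
# Illustrative sketch: compressing a 24x24 grid of vision patch features into 64
# latent tokens with the resampler above, using 2D RoPE over the patch grid.
# Dimensions are placeholder assumptions.
def _example_perceiver_resampler():
    resampler = PerceiverResampler(input_dim=1024, output_dim=512, num_latents=64,
                                   num_heads=8, num_layers=2, use_rope=True)
    patches = torch.randn(2, 24 * 24, 1024)                # e.g. ViT patch embeddings
    latents = resampler(patches, spatial_size=(24, 24))
    return latents.shape                                   # [2, 64, 512]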
class SpatialAwareProjector (nn .Module ):
"""
Spatial-aware projector with 2D RoPE.
"""
def __init__ (
self ,
vision_hidden_size :int ,
llm_hidden_size :int ,
num_tokens :int =64 ,
spatial_pool_size :int =8 ,
use_rope :bool =True ,
):
super ().__init__ ()
self .num_tokens =num_tokens
self .spatial_pool_size =spatial_pool_size
self .use_rope =use_rope
self .spatial_conv =nn .Sequential (
nn .Conv2d (vision_hidden_size ,llm_hidden_size ,3 ,padding =1 ),
nn .GELU (),
nn .Conv2d (llm_hidden_size ,llm_hidden_size ,3 ,padding =1 ),
nn .GELU (),
)
self .adaptive_pool =nn .AdaptiveAvgPool2d ((spatial_pool_size ,spatial_pool_size ))
self .proj =nn .Sequential (
nn .Linear (llm_hidden_size ,llm_hidden_size ),
nn .GELU (),
nn .Linear (llm_hidden_size ,llm_hidden_size ),
)
self .norm =nn .LayerNorm (llm_hidden_size )
def forward (self ,vision_features :torch .Tensor ,spatial_size :Optional [Tuple [int ,int ]]=None )->torch .Tensor :
batch_size =vision_features .shape [0 ]
if vision_features .dim ()==3 :
seq_len =vision_features .shape [1 ]
if spatial_size is None :
h =w =int (math .sqrt (seq_len ))
else :
h ,w =spatial_size
vision_features =vision_features .view (batch_size ,h ,w ,-1 )
x =vision_features .permute (0 ,3 ,1 ,2 )
x =self .spatial_conv (x )
x =self .adaptive_pool (x )
x =x .flatten (2 ).transpose (1 ,2 )
if self .use_rope :
cos ,sin =compute_2d_rope (self .spatial_pool_size ,self .spatial_pool_size ,x .shape [-1 ],x .device ,x .dtype )
x =apply_rope (x ,cos .unsqueeze (0 ),sin .unsqueeze (0 ))
x =self .proj (x )
x =self .norm (x )
return x
class CAbstractor (nn .Module ):
"""
C-Abstractor: Compressed Abstraction for efficient multimodal fusion.
Now with 2D RoPE support.
"""
def __init__ (
self ,
vision_hidden_size :int ,
llm_hidden_size :int ,
num_tokens :int =64 ,
num_heads :int =8 ,
compression_ratio :int =4 ,
use_rope :bool =True ,
):
super ().__init__ ()
self .num_tokens =num_tokens
self .use_rope =use_rope
self .input_proj =nn .Linear (vision_hidden_size ,llm_hidden_size )
self .compress =nn .Sequential (
nn .Conv1d (llm_hidden_size ,llm_hidden_size ,kernel_size =compression_ratio ,stride =compression_ratio ),
nn .GELU (),
)
self .queries =nn .Parameter (torch .randn (1 ,num_tokens ,llm_hidden_size )*0.02 )
self .cross_attn =nn .MultiheadAttention (
embed_dim =llm_hidden_size ,
num_heads =num_heads ,
batch_first =True ,
dropout =0.1 ,
)
self .ff =nn .Sequential (
nn .LayerNorm (llm_hidden_size ),
nn .Linear (llm_hidden_size ,llm_hidden_size *4 ),
nn .GELU (),
nn .Linear (llm_hidden_size *4 ,llm_hidden_size ),
)
self .norm =nn .LayerNorm (llm_hidden_size )
def forward (self ,vision_features :torch .Tensor ,spatial_size :Optional [Tuple [int ,int ]]=None )->torch .Tensor :
batch_size =vision_features .shape [0 ]
x =self .input_proj (vision_features )
if self .use_rope and spatial_size is not None :
h ,w =spatial_size
cos ,sin =compute_2d_rope (h ,w ,x .shape [-1 ],x .device ,x .dtype )
x =apply_rope (x ,cos .unsqueeze (0 ),sin .unsqueeze (0 ))
x =x .transpose (1 ,2 )
x =self .compress (x )
x =x .transpose (1 ,2 )
queries =self .queries .expand (batch_size ,-1 ,-1 )
abstracted ,_ =self .cross_attn (queries ,x ,x )
abstracted =abstracted +self .ff (abstracted )
return self .norm (abstracted )
class MultimodalProjector (nn .Module ):
"""
SOTA Multimodal Projector with all advanced features.
Combines:
- Locality-Enhanced ResNet Abstractor
- Multi-Scale Feature Fusion
- Multi-Scale Deformable Attention
- Dynamic Token Router
- 2D/3D RoPE
- Perceiver Resampler
"""
def __init__ (
self ,
vision_hidden_size :int ,
llm_hidden_size :int ,
num_tokens :int =64 ,
projector_type :str ="perceiver",
num_heads :int =8 ,
num_layers :int =2 ,
use_rope :bool =True ,
use_dynamic_routing :bool =False ,
use_locality_enhanced :bool =False ,
use_msff :bool =False ,
use_deformable_attn :bool =False ,
):
super ().__init__ ()
self .num_tokens =num_tokens
self .projector_type =projector_type
self .use_rope =use_rope
if projector_type =="perceiver":
self .projector =PerceiverResampler (
input_dim =vision_hidden_size ,
output_dim =llm_hidden_size ,
num_latents =num_tokens ,
num_heads =num_heads ,
num_layers =num_layers ,
use_rope =use_rope ,
use_dynamic_routing =use_dynamic_routing ,
)
elif projector_type =="spatial":
self .projector =SpatialAwareProjector (
vision_hidden_size =vision_hidden_size ,
llm_hidden_size =llm_hidden_size ,
num_tokens =num_tokens ,
use_rope =use_rope ,
)
elif projector_type =="c_abstractor":
self .projector =CAbstractor (
vision_hidden_size =vision_hidden_size ,
llm_hidden_size =llm_hidden_size ,
num_tokens =num_tokens ,
num_heads =num_heads ,
use_rope =use_rope ,
)
elif projector_type =="locality_enhanced":
self .projector =LocalityEnhancedResNetAbstractor (
input_dim =vision_hidden_size ,
output_dim =llm_hidden_size ,
num_tokens =num_tokens ,
use_2d_rope =use_rope ,
)
else :
self .projector =nn .Sequential (
nn .Linear (vision_hidden_size ,llm_hidden_size ),
nn .GELU (),
nn .Linear (llm_hidden_size ,llm_hidden_size ),
)
self .query_tokens =nn .Parameter (torch .randn (1 ,num_tokens ,llm_hidden_size )*0.02 )
self .cross_attn =nn .MultiheadAttention (
embed_dim =llm_hidden_size ,
num_heads =num_heads ,
batch_first =True
)
self .norm =nn .LayerNorm (llm_hidden_size )
if use_msff :
self .msff =MultiScaleFeatureFusion (
feature_dims =[vision_hidden_size ]*3 ,
output_dim =vision_hidden_size ,
)
else :
self .msff =None
if use_deformable_attn :
self .deformable_attn =MultiScaleDeformableAttention (
dim =llm_hidden_size ,
num_heads =num_heads ,
)
else :
self .deformable_attn =None
if use_dynamic_routing and projector_type !="perceiver":
self .token_router =DynamicTokenRouter (llm_hidden_size ,num_tokens )
else :
self .token_router =None
def forward (
self ,
vision_features :torch .Tensor ,
multi_scale_features :Optional [List [torch .Tensor ]]=None ,
spatial_size :Optional [Tuple [int ,int ]]=None ,
temporal_size :Optional [int ]=None ,
)->torch .Tensor :
"""Project and resample vision features."""
if self .msff is not None and multi_scale_features is not None :
vision_features =self .msff (multi_scale_features )
if self .projector_type in ["perceiver"]:
output =self .projector (vision_features ,spatial_size ,temporal_size )
elif self .projector_type in ["spatial","c_abstractor","locality_enhanced"]:
output =self .projector (vision_features ,spatial_size )
else :
batch_size =vision_features .shape [0 ]
projected =self .projector (vision_features )
queries =self .query_tokens .expand (batch_size ,-1 ,-1 )
resampled ,_ =self .cross_attn (queries ,projected ,projected )
output =self .norm (resampled )
if self .token_router is not None :
output ,_ =self .token_router (output )
return output
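# ---------------------------------------------------------------------------
# Illustrative sketch: projecting 1024-d vision patch features into 64 tokens in
# a 2048-d LLM embedding space via the Perceiver variant of the projector above.
# Dimensions are placeholder assumptions.
def _example_multimodal_projector():
    projector = MultimodalProjector(
        vision_hidden_size=1024,
        llm_hidden_size=2048,
        num_tokens=64,
        projector_type="perceiver",
    )
    patches = torch.randn(2, 24 * 24, 1024)
    tokens = projector(patches, spatial_size=(24, 24))
    return tokens.shape                                    # [2, 64, 2048]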
# ==============================================================================
# MODELS.COMPONENTS.MOE
# ==============================================================================
EPS =1e-5
class ExpertUtilizationTracker :
"""
Tracks expert utilization across MoE layers.
Attach to any MoE layer to log per-expert usage histograms.
Every `report_interval` steps, prints a report showing:
- Frequency of use per expert
- Cold experts (used < 1% of tokens)
- Count of experts offloaded to CPU (if ExpertOffloadManager is available)
Usage:
tracker = ExpertUtilizationTracker(num_experts=8, layer_name="layer.3.moe")
"""
def __init__ (
self ,
num_experts :int ,
layer_name :str ="moe",
report_interval :int =100 ,
cold_threshold_pct :float =1.0 ,
):
self .num_experts =num_experts
self .layer_name =layer_name
self .report_interval =report_interval
self .cold_threshold_pct =cold_threshold_pct
self ._counts =torch .zeros (num_experts ,dtype =torch .long )
self ._total_tokens =0
self ._step =0
self ._offload_manager =None
def link_offload_manager (self ,manager ):
"""Link an ExpertOffloadManager for cold-expert reporting."""
self ._offload_manager =manager
def record (self ,expert_indices :torch .Tensor ):
"""
Record expert selections from a forward pass.
Args:
expert_indices: [num_tokens, top_k] tensor of selected expert indices
"""
indices_flat =expert_indices .detach ().cpu ().reshape (-1 )
for idx in range (self .num_experts ):
self ._counts [idx ]+=(indices_flat ==idx ).sum ().item ()
self ._total_tokens +=expert_indices .shape [0 ]
def step (self ):
"""Advance step counter. Prints report and resets when interval is hit."""
self ._step +=1
if self ._step %self .report_interval ==0 :
self ._print_report ()
self ._reset ()
def _reset (self ):
"""Reset accumulators for next interval."""
self ._counts .zero_ ()
self ._total_tokens =0
def _print_report (self ):
"""Print expert utilization histogram."""
if self ._total_tokens ==0 :
return
freqs =self ._counts .float ()
total_assignments =freqs .sum ().item ()
if total_assignments ==0 :
return
pcts =(freqs /total_assignments *100 ).tolist ()
cold_experts =[i for i ,p in enumerate (pcts )if p <self .cold_threshold_pct ]
max_pct =max (pcts )if pcts else 0
bar_max =30
lines =[f"\n{'='*60 }"]
lines .append (f" Expert Utilization — {self .layer_name } (step {self ._step })")
lines .append (f" {self ._total_tokens :,} tokens, {int (total_assignments ):,} assignments")
lines .append (f"{'─'*60 }")
for i ,pct in enumerate (pcts ):
bar_len =int (pct /max_pct *bar_max )if max_pct >0 else 0
bar ="█"*bar_len
cold_tag =" ❄️"if pct <self .cold_threshold_pct else ""
lines .append (f" Expert {i :2d}{bar :<{bar_max }}│ {pct :5.1f}% ({int (self ._counts [i ]):>6d}){cold_tag }")
lines .append (f"{'─'*60 }")
if cold_experts :
lines .append (f" ❄️ Cold experts (<{self .cold_threshold_pct }%): {cold_experts }")
else :
lines .append (f" ✅ All experts active (no cold experts)")
if self ._offload_manager is not None :
status =self ._offload_manager .get_status ()
lines .append (f" 💾 Offloaded to CPU: {status ['cpu']}/{status ['total']}")
ideal_pct =100.0 /self .num_experts
balance =1.0 -(sum (abs (p -ideal_pct )for p in pcts )/(2 *100 ))
lines .append (f" ⚖️ Load balance score: {balance :.3f} (1.0 = perfect)")
lines .append (f"{'='*60 }")
print ("\n".join (lines ))
def get_stats (self )->dict :
"""Return current stats as a dict (for programmatic access)."""
total =self ._counts .sum ().item ()
if total ==0 :
pcts =[0.0 ]*self .num_experts
else :
pcts =(self ._counts .float ()/total *100 ).tolist ()
cold =[i for i ,p in enumerate (pcts )if p <self .cold_threshold_pct ]
ideal_pct =100.0 /self .num_experts
balance =1.0 -(sum (abs (p -ideal_pct )for p in pcts )/(2 *100 ))if total >0 else 0.0
return {
"step":self ._step ,
"layer_name":self .layer_name ,
"total_tokens":self ._total_tokens ,
"expert_counts":self ._counts .tolist (),
"expert_pcts":pcts ,
"cold_experts":cold ,
"balance_score":balance ,
}
def attach_utilization_trackers (
model :torch .nn .Module ,
report_interval :int =100 ,
)->list :
"""
Find all MoE layers in a model and attach ExpertUtilizationTrackers.
Returns list of trackers for manual step() calls in the training loop.
"""
trackers =[]
for name ,module in model .named_modules ():
if hasattr (module ,'experts')and hasattr (module ,'router'):
num_experts =len (module .experts )
tracker =ExpertUtilizationTracker (
num_experts =num_experts ,
layer_name =name ,
report_interval =report_interval ,
)
if hasattr (module ,'_expert_offload_manager'):
tracker .link_offload_manager (module ._expert_offload_manager )
module ._utilization_tracker =tracker
trackers .append (tracker )
if trackers :
print (f" 📊 Attached {len (trackers )} expert utilization trackers (report every {report_interval } steps)")
return trackers
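# ---------------------------------------------------------------------------
# Illustrative training-loop sketch: attach utilization trackers to every MoE
# layer, then advance them once per step so a histogram is printed every
# `report_interval` steps. `model` and `batches` are placeholder assumptions.
def _example_track_expert_utilization(model: nn.Module, batches):
    trackers = attach_utilization_trackers(model, report_interval=100)
    for batch in batches:
        _ = model(**batch)                                 # MoE layers record routing internally
        for tracker in trackers:
            tracker.step()
    return trackers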
class MoERouter (nn .Module ):
"""
SOTA Router for Mixture of Experts v2.0 - FP16 native.
Supports both traditional aux-loss routing and aux-lossless routing.
"""
def __init__ (self ,hidden_size :int ,num_experts :int ,top_k :int =2 ,
noise_std :float =0.01 ,capacity_factor :float =1.25 ,
aux_lossless :bool =True ):
super ().__init__ ()
self .num_experts =num_experts
self .top_k =top_k
self .noise_std =noise_std
self .capacity_factor =capacity_factor
self .hidden_size =hidden_size
self .aux_lossless =aux_lossless
self .input_norm =nn .LayerNorm (hidden_size ,eps =1e-5 )
self .gate =nn .Linear (hidden_size ,num_experts ,bias =False )
nn .init .normal_ (self .gate .weight ,mean =0.0 ,std =0.01 )
if aux_lossless :
self .expert_bias =nn .Parameter (torch .zeros (num_experts ))
def forward (self ,hidden_states :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ,torch .Tensor ]:
batch_size ,seq_len ,hidden_dim =hidden_states .shape
hidden_flat =hidden_states .view (-1 ,hidden_dim )
hidden_norm =self .input_norm (hidden_flat )
router_logits =self .gate (hidden_norm )
if self .aux_lossless :
router_logits =router_logits +self .expert_bias
if self .training and self .noise_std >0 :
noise =torch .randn_like (router_logits )*self .noise_std
noisy_logits =router_logits +noise
else :
noisy_logits =router_logits
router_probs =F .softmax (noisy_logits ,dim =-1 ,dtype =hidden_states .dtype )
top_k_probs ,top_k_indices =torch .topk (router_probs ,self .top_k ,dim =-1 )
prob_sum =top_k_probs .sum (dim =-1 ,keepdim =True ).clamp (min =EPS )
top_k_probs =top_k_probs /prob_sum
return top_k_probs ,top_k_indices ,router_logits
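# ---------------------------------------------------------------------------
# Illustrative sketch of the router above: top-2 routing over 8 experts for a
# small batch, returning renormalized top-k probabilities, expert indices, and
# the raw logits. Sizes are placeholder assumptions.
def _example_moe_router():
    router = MoERouter(hidden_size=256, num_experts=8, top_k=2)
    hidden = torch.randn(2, 10, 256)
    top_k_probs, top_k_indices, router_logits = router(hidden)
    return top_k_probs.shape, top_k_indices.shape, router_logits.shape   # [20, 2], [20, 2], [20, 8]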
class MoEExpert (nn .Module ):
"""
Single expert FFN with SwiGLU activation - FP16 native.
"""
def __init__ (self ,hidden_size :int ,intermediate_size :int ,dropout :float =0.0 ):
super ().__init__ ()
self .hidden_size =hidden_size
self .intermediate_size =intermediate_size
self .gate_proj =nn .Linear (hidden_size ,intermediate_size ,bias =False )
self .up_proj =nn .Linear (hidden_size ,intermediate_size ,bias =False )
self .down_proj =nn .Linear (intermediate_size ,hidden_size ,bias =False )
self .act_fn =nn .SiLU ()
self .dropout =nn .Dropout (dropout )if dropout >0 else nn .Identity ()
self ._init_weights ()
def _init_weights (self ):
std =0.02
nn .init .normal_ (self .gate_proj .weight ,mean =0.0 ,std =std )
nn .init .normal_ (self .up_proj .weight ,mean =0.0 ,std =std )
nn .init .normal_ (self .down_proj .weight ,mean =0.0 ,std =std *0.5 )
def forward (self ,x :torch .Tensor )->torch .Tensor :
gate =self .act_fn (self .gate_proj (x ))
up =self .up_proj (x )
out =self .down_proj (gate *up )
return self .dropout (out )
class SharedExpert (nn .Module ):
"""
Isolated Shared Expert (v2.0) - FP16 native.
Always active, separate from routed experts.
The shared expert processes all tokens independently of routing decisions.
"""
def __init__ (self ,hidden_size :int ,intermediate_size :int ,dropout :float =0.0 ,
isolated :bool =True ):
super ().__init__ ()
self .hidden_size =hidden_size
self .intermediate_size =intermediate_size
self .isolated =isolated
self .gate_proj =nn .Linear (hidden_size ,intermediate_size ,bias =False )
self .up_proj =nn .Linear (hidden_size ,intermediate_size ,bias =False )
self .down_proj =nn .Linear (intermediate_size ,hidden_size ,bias =False )
self .act_fn =nn .SiLU ()
self .dropout =nn .Dropout (dropout )if dropout >0 else nn .Identity ()
self .shared_gate =nn .Parameter (torch .ones (1 )*0.5 )
if isolated :
self .pre_norm =nn .LayerNorm (hidden_size ,eps =1e-5 )
self ._init_weights ()
def _init_weights (self ):
std =0.02
nn .init .normal_ (self .gate_proj .weight ,mean =0.0 ,std =std )
nn .init .normal_ (self .up_proj .weight ,mean =0.0 ,std =std )
nn .init .normal_ (self .down_proj .weight ,mean =0.0 ,std =std *0.5 )
def forward (self ,x :torch .Tensor )->torch .Tensor :
if self .isolated :
x =self .pre_norm (x )
gate =self .act_fn (self .gate_proj (x ))
up =self .up_proj (x )
out =self .down_proj (gate *up )
out =self .dropout (out )
return out *torch .sigmoid (self .shared_gate )
class MoELayer (nn .Module ):
"""
SOTA Mixture of Experts layer v2.0 - FP16 native.
Supports Aux-Lossless MoE with Isolated Shared Expert.
"""
def __init__ (
self ,
hidden_size :int ,
intermediate_size :int ,
num_experts :int =8 ,
num_experts_per_tok :int =2 ,
use_shared_expert :bool =True ,
shared_expert_intermediate_size :Optional [int ]=None ,
capacity_factor :float =1.25 ,
expert_dropout :float =0.0 ,
aux_lossless :bool =True ,
isolated_shared :bool =True ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_experts =num_experts
self .num_experts_per_tok =num_experts_per_tok
self .use_shared_expert =use_shared_expert
self .capacity_factor =capacity_factor
self .aux_lossless =aux_lossless
self .router =MoERouter (
hidden_size ,num_experts ,num_experts_per_tok ,
capacity_factor =capacity_factor ,aux_lossless =aux_lossless
)
self .experts =nn .ModuleList ([
MoEExpert (hidden_size ,intermediate_size ,expert_dropout )
for _ in range (num_experts )
])
if use_shared_expert :
shared_size =shared_expert_intermediate_size or intermediate_size
self .shared_expert =SharedExpert (
hidden_size ,shared_size ,expert_dropout ,isolated =isolated_shared
)
else :
self .shared_expert =None
def forward (self ,hidden_states :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ]:
batch_size ,seq_len ,hidden_size =hidden_states .shape
hidden_flat =hidden_states .view (-1 ,hidden_size )
num_tokens =hidden_flat .shape [0 ]
top_k_probs ,top_k_indices ,router_logits =self .router (hidden_states )
if hasattr (self ,'_utilization_tracker'):
self ._utilization_tracker .record (top_k_indices )
final_output =torch .zeros_like (hidden_flat )
for expert_idx in range (self .num_experts ):
expert =self .experts [expert_idx ]
for k in range (self .num_experts_per_tok ):
mask =(top_k_indices [:,k ]==expert_idx )
if mask .any ():
expert_input =hidden_flat [mask ]
expert_output =expert (expert_input )
weight =top_k_probs [mask ,k :k +1 ]
final_output [mask ]=final_output [mask ]+weight *expert_output
if self .shared_expert is not None :
shared_output =self .shared_expert (hidden_flat )
final_output =final_output +shared_output
final_output =final_output .view (batch_size ,seq_len ,hidden_size )
aux_loss =self ._compute_aux_loss (router_logits ,top_k_indices ,num_tokens )
return final_output ,aux_loss
def _compute_aux_loss (self ,router_logits :torch .Tensor ,top_k_indices :torch .Tensor ,
num_tokens :int )->torch .Tensor :
device =router_logits .device
dtype =router_logits .dtype
if self .aux_lossless :
z_loss =torch .logsumexp (router_logits ,dim =-1 ).square ().mean ()*0.0001
return z_loss
router_probs =F .softmax (router_logits ,dim =-1 ,dtype =dtype )
expert_mask =F .one_hot (top_k_indices ,self .num_experts ).to (dtype )
denominator =max (num_tokens *self .num_experts_per_tok ,1 )
tokens_per_expert =expert_mask .sum (dim =(0 ,1 ))/denominator
avg_probs =router_probs .mean (dim =0 )
load_balance_loss =self .num_experts *(tokens_per_expert *avg_probs ).sum ()
z_loss =torch .logsumexp (router_logits ,dim =-1 ).square ().mean ()*0.001
router_probs_safe =router_probs .clamp (EPS ,1.0 -EPS )
log_probs =torch .log (router_probs_safe )
entropy =-(router_probs_safe *log_probs ).sum (dim =-1 ).mean ()
max_entropy =torch .log (torch .tensor (float (self .num_experts ),device =device ,dtype =dtype ))
entropy_loss =(max_entropy -entropy ).clamp (min =0.0 )*0.01
expert_usage =(tokens_per_expert >0.01 ).to (dtype ).mean ()
utilization_loss =(1.0 -expert_usage )*0.1
total_aux_loss =load_balance_loss +z_loss +entropy_loss +utilization_loss
return total_aux_loss
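# ---------------------------------------------------------------------------
# Hedged note and sketch for the auxiliary losses above (toy sizes, never
# called at import time). In aux-lossless mode only the router z-loss
#     L_z = 1e-4 * mean( logsumexp(router_logits, dim=-1) ** 2 )
# is returned; otherwise _compute_aux_loss adds load-balance, z-loss, entropy
# and utilization terms.
def _example_moe_layer_aux_loss():
    layer = MoELayer(hidden_size=16, intermediate_size=32,
                     num_experts=4, num_experts_per_tok=2, aux_lossless=True)
    out, aux = layer(torch.randn(2, 5, 16))
    assert out.shape == (2, 5, 16)
    assert aux.dim() == 0                           # scalar z-loss in aux-lossless mode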
class ExpertChoiceMoELayer (nn .Module ):
"""
Expert Choice MoE - FP16 native.
"""
def __init__ (
self ,
hidden_size :int ,
intermediate_size :int ,
num_experts :int =8 ,
capacity_factor :float =1.0 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_experts =num_experts
self .capacity_factor =capacity_factor
self .input_norm =nn .LayerNorm (hidden_size ,eps =1e-5 )
self .gate =nn .Linear (hidden_size ,num_experts ,bias =False )
nn .init .normal_ (self .gate .weight ,mean =0.0 ,std =0.01 )
self .experts =nn .ModuleList ([
MoEExpert (hidden_size ,intermediate_size )
for _ in range (num_experts )
])
def forward (self ,hidden_states :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ]:
batch_size ,seq_len ,hidden_size =hidden_states .shape
hidden_flat =hidden_states .view (-1 ,hidden_size )
num_tokens =hidden_flat .shape [0 ]
hidden_norm =self .input_norm (hidden_flat )
router_logits =self .gate (hidden_norm )
router_probs =F .softmax (router_logits ,dim =0 ,dtype =hidden_states .dtype )
capacity =int (num_tokens *self .capacity_factor /self .num_experts )
capacity =max (capacity ,1 )
final_output =torch .zeros_like (hidden_flat )
token_counts =torch .zeros (num_tokens ,device =hidden_flat .device ,dtype =hidden_flat .dtype )
for expert_idx in range (self .num_experts ):
expert =self .experts [expert_idx ]
expert_probs =router_probs [:,expert_idx ]
top_probs ,top_indices =torch .topk (expert_probs ,min (capacity ,num_tokens ))
expert_input =hidden_flat [top_indices ]
expert_output =expert (expert_input )
final_output [top_indices ]=final_output [top_indices ]+top_probs .unsqueeze (-1 )*expert_output
token_counts [top_indices ]=token_counts [top_indices ]+top_probs
token_counts =token_counts .clamp (min =EPS )
final_output =final_output /token_counts .unsqueeze (-1 )
final_output =final_output .view (batch_size ,seq_len ,hidden_size )
aux_loss =torch .logsumexp (router_logits ,dim =-1 ).square ().mean ()*0.001
return final_output ,aux_loss
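# ---------------------------------------------------------------------------
# Worked capacity arithmetic for ExpertChoiceMoELayer: with 1024 tokens,
# capacity_factor = 1.0 and 8 experts, each expert selects
#     capacity = int(1024 * 1.0 / 8) = 128
# tokens, so compute per expert stays fixed regardless of router skew
# (a token may still be picked by several experts, or by none).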
# ==============================================================================
# MODELS.ENCODERS.VISION
# ==============================================================================
EPS =1e-5
class RoPE2DEncoder (nn .Module ):
"""
2D Rotary Position Embedding for vision encoder patches.
Matches the 2D-RoPE in image generator for seamless integration.
"""
def __init__ (self ,dim :int ,max_height :int =128 ,max_width :int =128 ,base :float =10000.0 ):
super ().__init__ ()
self .dim =dim
self .max_height =max_height
self .max_width =max_width
self .base =base
self .dim_x =dim //2
self .dim_y =dim -self .dim_x
inv_freq_x =1.0 /(base **(torch .arange (0 ,self .dim_x ,2 ,dtype =torch .float32 )/self .dim_x ))
inv_freq_y =1.0 /(base **(torch .arange (0 ,self .dim_y ,2 ,dtype =torch .float32 )/self .dim_y ))
self .register_buffer ('inv_freq_x',inv_freq_x ,persistent =False )
self .register_buffer ('inv_freq_y',inv_freq_y ,persistent =False )
def forward (self ,x :torch .Tensor ,height :int ,width :int )->Tuple [torch .Tensor ,torch .Tensor ]:
device =x .device
dtype =x .dtype
pos_x =torch .arange (width ,device =device ,dtype =torch .float32 )
pos_y =torch .arange (height ,device =device ,dtype =torch .float32 )
freqs_x =torch .outer (pos_x ,self .inv_freq_x .to (device ))
freqs_y =torch .outer (pos_y ,self .inv_freq_y .to (device ))
freqs_x =torch .cat ([freqs_x ,freqs_x ],dim =-1 )
freqs_y =torch .cat ([freqs_y ,freqs_y ],dim =-1 )
cos_2d =torch .zeros (height ,width ,self .dim ,device =device ,dtype =dtype )
sin_2d =torch .zeros (height ,width ,self .dim ,device =device ,dtype =dtype )
for y in range (height ):
for w in range (width ):
cos_2d [y ,w ,:self .dim_x ]=freqs_x [w ].cos ().to (dtype )
sin_2d [y ,w ,:self .dim_x ]=freqs_x [w ].sin ().to (dtype )
cos_2d [y ,w ,self .dim_x :]=freqs_y [y ].cos ().to (dtype )
sin_2d [y ,w ,self .dim_x :]=freqs_y [y ].sin ().to (dtype )
cos_2d =cos_2d .view (height *width ,self .dim )
sin_2d =sin_2d .view (height *width ,self .dim )
return cos_2d ,sin_2d
def apply_rope_2d_encoder (x :torch .Tensor ,cos :torch .Tensor ,sin :torch .Tensor )->torch .Tensor :
"""Apply 2D rotary position embedding to tensor."""
x1 =x [...,:x .shape [-1 ]//2 ]
x2 =x [...,x .shape [-1 ]//2 :]
rotated =torch .cat ((-x2 ,x1 ),dim =-1 )
return x *cos +rotated *sin
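# ---------------------------------------------------------------------------
# Hedged shape sketch for 2D-RoPE (toy sizes, never called at import time).
# The encoder returns per-position cos/sin tables of shape [height*width, dim];
# apply_rope_2d_encoder rotates the two channel halves with them.
def _example_rope_2d():
    rope = RoPE2DEncoder(dim=8)
    x = torch.randn(1, 1, 6 * 6, 8)                 # [batch, heads, h*w, head_dim]
    cos, sin = rope(x, height=6, width=6)
    assert cos.shape == (36, 8) and sin.shape == (36, 8)
    x_rot = apply_rope_2d_encoder(x, cos.unsqueeze(0).unsqueeze(0), sin.unsqueeze(0).unsqueeze(0))
    assert x_rot.shape == x.shape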
class TiTokTokenizer (nn .Module ):
"""
TiTok-style 1D Tokenizer for efficient visual representation.
Converts 2D patch grid to 1D token sequence with learnable compression.
"""
def __init__ (self ,hidden_size :int ,num_tokens :int =256 ,num_patches :int =576 ):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_tokens =num_tokens
self .num_patches =num_patches
self .compress =nn .Sequential (
nn .Linear (hidden_size ,hidden_size ),
nn .GELU (),
nn .Linear (hidden_size ,hidden_size ),
)
self .token_queries =nn .Parameter (torch .randn (1 ,num_tokens ,hidden_size )*0.02 )
self .compress_attn =nn .MultiheadAttention (
embed_dim =hidden_size ,
num_heads =8 ,
batch_first =True ,
dropout =0.1 ,
)
self .compress_norm =nn .LayerNorm (hidden_size )
def forward (self ,x :torch .Tensor )->torch .Tensor :
"""
Compress patch features to TiTok-style 1D tokens.
Args:
x: [B, num_patches, hidden_size] patch features
Returns:
[B, num_tokens, hidden_size] compressed token features
"""
batch_size =x .shape [0 ]
queries =self .token_queries .expand (batch_size ,-1 ,-1 )
x_proj =self .compress (x )
tokens ,_ =self .compress_attn (queries ,x_proj ,x_proj )
tokens =self .compress_norm (queries +tokens )
return tokens
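# ---------------------------------------------------------------------------
# Hedged usage sketch for TiTokTokenizer (toy hidden size, random features,
# never called at import time): a 24x24 patch grid (576 patches) is compressed
# to 256 learned 1D tokens via cross-attention from the token queries.
def _example_titok():
    titok = TiTokTokenizer(hidden_size=64, num_tokens=256, num_patches=576)
    patches = torch.randn(2, 576, 64)
    tokens = titok(patches)
    assert tokens.shape == (2, 256, 64)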
class DeepStack (nn .Module ):
"""
DeepStack: Fuses multi-level ViT features to capture fine-grained details and sharpen image-text alignment.
SOTA: Instead of using only the final layer features, DeepStack combines features from
multiple intermediate layers of the vision encoder, enabling:
- Better fine-grained detail capture (early layers have high-resolution features)
- Stronger image-text alignment (different layers capture different semantic levels)
- Improved generation quality for both understanding and generation tasks
Architecture:
- Collects features from selected layers (typically: early, middle, late)
- Projects each level to a common dimension
- Combines via learned weighted sum or attention
"""
def __init__ (self ,hidden_size :int ,num_layers :int =3 ,use_attention :bool =True ):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_layers =num_layers
self .use_attention =use_attention
self .level_projs =nn .ModuleList ([
nn .Linear (hidden_size ,hidden_size )
for _ in range (num_layers )
])
self .level_norms =nn .ModuleList ([
nn .LayerNorm (hidden_size )
for _ in range (num_layers )
])
if use_attention :
self .fusion_query =nn .Parameter (torch .randn (1 ,1 ,hidden_size )*0.02 )
self .fusion_attn =nn .MultiheadAttention (
embed_dim =hidden_size ,
num_heads =8 ,
batch_first =True ,
dropout =0.1 ,
)
self .fusion_norm =nn .LayerNorm (hidden_size )
else :
self .level_weights =nn .Parameter (torch .ones (num_layers )/num_layers )
self .output_proj =nn .Sequential (
nn .Linear (hidden_size ,hidden_size ),
nn .GELU (),
nn .Linear (hidden_size ,hidden_size ),
)
def forward (self ,multi_level_features :list )->torch .Tensor :
"""
Fuse multi-level features.
Args:
multi_level_features: List of [B, seq_len, hidden_size] features from different layers
Returns:
[B, seq_len, hidden_size] fused features
"""
if len(multi_level_features) > self.num_layers:
    # Keep only the deepest num_layers feature maps; fewer levels are used as-is.
    multi_level_features = multi_level_features[-self.num_layers:]
batch_size ,seq_len ,_ =multi_level_features [0 ].shape
projected =[]
for i ,(feat ,proj ,norm )in enumerate (zip (multi_level_features ,self .level_projs ,self .level_norms )):
projected .append (norm (proj (feat )))
if self .use_attention :
stacked =torch .cat (projected ,dim =1 )
query =self .fusion_query .expand (batch_size ,seq_len ,-1 )
fused ,_ =self .fusion_attn (query ,stacked ,stacked )
fused =self .fusion_norm (query +fused )
else :
weights =F .softmax (self .level_weights ,dim =0 )
fused =sum (w *feat for w ,feat in zip (weights ,projected ))
return self .output_proj (fused )
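# ---------------------------------------------------------------------------
# Hedged usage sketch for DeepStack (toy sizes, never called at import time):
# three per-layer ViT feature maps are projected, normalized and fused back
# into a single [B, seq_len, hidden] tensor.
def _example_deepstack():
    deepstack = DeepStack(hidden_size=64, num_layers=3)
    levels = [torch.randn(2, 100, 64) for _ in range(3)]  # early / middle / late features
    fused = deepstack(levels)
    assert fused.shape == (2, 100, 64)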
class DualStreamEncoderAttention (nn .Module ):
"""
Symmetric Dual-Stream Self-Attention for vision encoding.
Matches the dual-stream architecture in image generator.
"""
def __init__ (self ,hidden_size :int ,num_heads :int =8 ,max_height :int =64 ,max_width :int =64 ):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_heads =num_heads
self .head_dim =hidden_size //num_heads
self .scale =self .head_dim **-0.5
self .to_qkv_a =nn .Linear (hidden_size ,hidden_size *3 ,bias =False )
self .to_qkv_b =nn .Linear (hidden_size ,hidden_size *3 ,bias =False )
self .to_out_a =nn .Linear (hidden_size ,hidden_size ,bias =False )
self .to_out_b =nn .Linear (hidden_size ,hidden_size ,bias =False )
self .norm_a =nn .LayerNorm (hidden_size )
self .norm_b =nn .LayerNorm (hidden_size )
self .rope_2d =RoPE2DEncoder (self .head_dim ,max_height ,max_width )
def forward (self ,x_a :torch .Tensor ,x_b :torch .Tensor ,height :int ,width :int )->Tuple [torch .Tensor ,torch .Tensor ]:
batch_size ,seq_len ,_ =x_a .shape
x_a =self .norm_a (x_a )
x_b =self .norm_b (x_b )
qkv_a =self .to_qkv_a (x_a ).reshape (batch_size ,seq_len ,3 ,self .num_heads ,self .head_dim )
qkv_b =self .to_qkv_b (x_b ).reshape (batch_size ,seq_len ,3 ,self .num_heads ,self .head_dim )
q_a ,k_a ,v_a =qkv_a .unbind (dim =2 )
q_b ,k_b ,v_b =qkv_b .unbind (dim =2 )
cos ,sin =self .rope_2d (x_a ,height ,width )
cos =cos .unsqueeze (0 ).unsqueeze (0 )
sin =sin .unsqueeze (0 ).unsqueeze (0 )
q_a =q_a .transpose (1 ,2 )
k_a =k_a .transpose (1 ,2 )
v_a =v_a .transpose (1 ,2 )
q_b =q_b .transpose (1 ,2 )
k_b =k_b .transpose (1 ,2 )
v_b =v_b .transpose (1 ,2 )
q_a =apply_rope_2d_encoder (q_a ,cos ,sin )
k_a =apply_rope_2d_encoder (k_a ,cos ,sin )
q_b =apply_rope_2d_encoder (q_b ,cos ,sin )
k_b =apply_rope_2d_encoder (k_b ,cos ,sin )
k_combined =torch .cat ([k_a ,k_b ],dim =2 )
v_combined =torch .cat ([v_a ,v_b ],dim =2 )
attn_a =F .scaled_dot_product_attention (q_a ,k_combined ,v_combined )
attn_b =F .scaled_dot_product_attention (q_b ,k_combined ,v_combined )
attn_a =attn_a .transpose (1 ,2 ).reshape (batch_size ,seq_len ,self .hidden_size )
attn_b =attn_b .transpose (1 ,2 ).reshape (batch_size ,seq_len ,self .hidden_size )
out_a =self .to_out_a (attn_a )
out_b =self .to_out_b (attn_b )
return out_a ,out_b
class VisionEncoderBlock (nn .Module ):
"""Single block with dual-stream attention and FFN."""
def __init__ (self ,hidden_size :int ,num_heads :int =8 ,ff_mult :int =4 ,max_height :int =64 ,max_width :int =64 ):
super ().__init__ ()
self .dual_attn =DualStreamEncoderAttention (hidden_size ,num_heads ,max_height ,max_width )
self .ffn_a =nn .Sequential (
nn .LayerNorm (hidden_size ),
nn .Linear (hidden_size ,hidden_size *ff_mult ),
nn .GELU (),
nn .Linear (hidden_size *ff_mult ,hidden_size ),
)
self .ffn_b =nn .Sequential (
nn .LayerNorm (hidden_size ),
nn .Linear (hidden_size ,hidden_size *ff_mult ),
nn .GELU (),
nn .Linear (hidden_size *ff_mult ,hidden_size ),
)
def forward (self ,x_a :torch .Tensor ,x_b :torch .Tensor ,height :int ,width :int )->Tuple [torch .Tensor ,torch .Tensor ]:
attn_a ,attn_b =self .dual_attn (x_a ,x_b ,height ,width )
x_a =x_a +attn_a
x_b =x_b +attn_b
x_a =x_a +self .ffn_a (x_a )
x_b =x_b +self .ffn_b (x_b )
return x_a ,x_b
class VisionEncoder (nn .Module ):
"""
SOTA Vision Encoder with 2D-RoPE, TiTok tokenization, and Dual-Stream Attention.
Features:
- SigLIP 2 / CLIP backbone for robust visual features
- 2D-RoPE for flexible aspect ratios
- TiTok-style 1D tokenization for efficient representation
- Dual-stream attention for symmetric processing
- FP16-native numerical stability
"""
def __init__ (
self ,
model_name :str ="google/siglip-so400m-patch14-384",
freeze :bool =False ,
use_pooled_output :bool =False ,
use_dual_stream :bool =True ,
use_titok :bool =True ,
num_titok_tokens :int =256 ,
num_dual_stream_layers :int =2 ,
):
super ().__init__ ()
self .model_name =model_name
self .use_pooled_output =use_pooled_output
self .use_dual_stream =use_dual_stream
self .use_titok =use_titok
self ._is_siglip ="siglip"in model_name .lower ()
print (f"\n👁️ Loading Vision Encoder: {model_name }")
if self ._is_siglip :
self ._init_siglip (model_name ,freeze )
else :
self ._init_clip (model_name ,freeze )
self .rope_2d =RoPE2DEncoder (
dim =self .hidden_size ,
max_height =64 ,
max_width =64 ,
)
print (f" 📐 2D-RoPE: Flexible aspect ratio support")
if use_dual_stream :
patch_size =getattr (self .vision_model .config ,'patch_size',14 )
image_size =getattr (self .vision_model .config ,'image_size',384 )
max_patches =(image_size //patch_size )
self .dual_stream_layers =nn .ModuleList ([
VisionEncoderBlock (
hidden_size =self .hidden_size ,
num_heads =8 ,
ff_mult =4 ,
max_height =max_patches ,
max_width =max_patches ,
)
for _ in range (num_dual_stream_layers )
])
print (f" 🔄 Dual-Stream: {num_dual_stream_layers } layers")
else :
self .dual_stream_layers =None
if use_titok :
self .titok =TiTokTokenizer (
hidden_size =self .hidden_size ,
num_tokens =num_titok_tokens ,
num_patches =self .num_patches ,
)
print (f" 🎫 TiTok: {self .num_patches } patches -> {num_titok_tokens } tokens")
else :
self .titok =None
def _init_siglip (self ,model_name :str ,freeze :bool ):
"""Initialize SigLIP 2 vision encoder."""
try :
from transformers import SiglipVisionModel ,SiglipImageProcessor
self .vision_model =SiglipVisionModel .from_pretrained (model_name )
self .image_processor =SiglipImageProcessor .from_pretrained (model_name )
self .hidden_size =self .vision_model .config .hidden_size
print (f" 🎯 Using SigLIP 2 (recommended for MoE)")
print (f" ✅ Hidden size: {self .hidden_size }")
print (f" 📐 Native size: {self .vision_model .config .image_size } (multi-scale: 256-512px)")
print (f" 🔲 Patch size: {self .vision_model .config .patch_size }")
except ImportError :
print (" ⚠️ SigLIP not available, falling back to CLIP")
self ._is_siglip =False
self ._init_clip ("openai/clip-vit-large-patch14",freeze )
return
if freeze :
for param in self .vision_model .parameters ():
param .requires_grad =False
print (f" ❄️ Vision encoder backbone frozen")
else :
print (f" 🔥 Vision encoder backbone trainable")
def _init_clip (self ,model_name :str ,freeze :bool ):
"""Initialize CLIP vision encoder (legacy support)."""
from transformers import CLIPVisionModel ,CLIPImageProcessor
self .vision_model =CLIPVisionModel .from_pretrained (model_name )
self .image_processor =CLIPImageProcessor .from_pretrained (model_name )
self .hidden_size =self .vision_model .config .hidden_size
print (f" 📎 Using CLIP")
print (f" ✅ Hidden size: {self .hidden_size }")
if freeze :
for param in self .vision_model .parameters ():
param .requires_grad =False
print (f" ❄️ Vision encoder backbone frozen")
else :
print (f" 🔥 Vision encoder backbone trainable")
def forward(self, pixel_values: torch.Tensor, return_titok: Optional[bool] = None) -> torch.Tensor:
"""
Extract vision features from images with SOTA enhancements.
Args:
pixel_values: [B, C, H, W] tensor of images
return_titok: Override for TiTok output (None uses self.use_titok)
Returns:
[B, num_tokens, hidden_size] tensor (TiTok) or
[B, num_patches, hidden_size] tensor (standard) or
[B, hidden_size] if use_pooled_output=True
"""
outputs =self .vision_model (pixel_values =pixel_values )
features =outputs .last_hidden_state
if self .use_pooled_output :
if hasattr (outputs ,'pooler_output')and outputs .pooler_output is not None :
return outputs .pooler_output
else :
return features .mean (dim =1 )
batch_size ,num_patches ,hidden_size =features .shape
patch_size =getattr (self .vision_model .config ,'patch_size',14 )
image_size =getattr (self .vision_model .config ,'image_size',384 )
if num_patches ==(image_size //patch_size )**2 +1 :
cls_token =features [:,:1 ]
features =features [:,1 :]
num_patches =num_patches -1
has_cls =True
else :
cls_token =None
has_cls =False
height =width =int (math .sqrt (num_patches ))
if self .dual_stream_layers is not None :
x_a =features
x_b =features .clone ()
for layer in self .dual_stream_layers :
x_a ,x_b =layer (x_a ,x_b ,height ,width )
features =(x_a +x_b )/2
use_titok_now =return_titok if return_titok is not None else self .use_titok
if use_titok_now and self .titok is not None :
features =self .titok (features )
return features
def get_image_processor (self ):
"""Return the image processor for preprocessing."""
return self .image_processor
@property
def num_patches (self )->int :
"""Get number of patches for the vision model."""
config =self .vision_model .config
image_size =config .image_size
patch_size =config .patch_size
return (image_size //patch_size )**2
@property
def image_size (self )->int :
"""Get expected image size."""
return self .vision_model .config .image_size
@property
def output_tokens (self )->int :
"""Get number of output tokens (considering TiTok compression)."""
if self .use_titok and self .titok is not None :
return self .titok .num_tokens
return self .num_patches
SIGLIP_MODELS ={
"siglip-base":"google/siglip-base-patch16-224",
"siglip-base-384":"google/siglip-base-patch16-384",
"siglip-large":"google/siglip-large-patch16-256",
"siglip-large-384":"google/siglip-large-patch16-384",
"siglip-so400m":"google/siglip-so400m-patch14-384",
"siglip-so400m-224":"google/siglip-so400m-patch14-224",
"clip-base":"openai/clip-vit-base-patch16",
"clip-large":"openai/clip-vit-large-patch14",
}
def get_vision_encoder (
model_key :str ="siglip-so400m",
freeze :bool =False ,
use_dual_stream :bool =True ,
use_titok :bool =True ,
**kwargs
)->VisionEncoder :
"""
Get a vision encoder by key name with SOTA enhancements.
Args:
model_key: Key from SIGLIP_MODELS or full model name
freeze: Whether to freeze encoder backbone weights
use_dual_stream: Enable dual-stream attention
use_titok: Enable TiTok 1D tokenization
**kwargs: Additional arguments for VisionEncoder
Returns:
VisionEncoder instance
"""
model_name =SIGLIP_MODELS .get (model_key ,model_key )
return VisionEncoder (
model_name =model_name ,
freeze =freeze ,
use_dual_stream =use_dual_stream ,
use_titok =use_titok ,
**kwargs
)
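# ---------------------------------------------------------------------------
# Hedged usage sketch for get_vision_encoder (never called at import time).
# Building the encoder downloads pretrained SigLIP weights from the Hub, so
# this is illustration only; freeze=True keeps the backbone fixed while the
# dual-stream and TiTok heads remain trainable.
def _example_vision_encoder():
    encoder = get_vision_encoder("siglip-so400m", freeze=True)
    images = torch.randn(1, 3, encoder.image_size, encoder.image_size)
    features = encoder(images)                      # [1, output_tokens, hidden_size]
    return features.shape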
# ==============================================================================
# MODELS.ENCODERS.VIDEO
# ==============================================================================
EPS =1e-5
class TextTimestampAlignment (nn .Module ):
"""
Text-Timestamp Alignment: Precise timestamp-grounded event localization for stronger video temporal modeling.
SOTA: Moves beyond T-RoPE by explicitly aligning text descriptions with video timestamps,
enabling:
- Precise temporal localization of events described in text
- Better video captioning with accurate time references
- Improved video question-answering with temporal reasoning
- Enhanced video generation with temporal control
Architecture:
- Cross-attention between text features and frame-level video features
- Learnable timestamp embeddings for each frame
- Temporal alignment loss during training
"""
def __init__ (self ,hidden_size :int ,max_frames :int =64 ,num_heads :int =8 ):
super ().__init__ ()
self .hidden_size =hidden_size
self .max_frames =max_frames
self .num_heads =num_heads
self .timestamp_embedding =nn .Embedding (max_frames ,hidden_size )
self .video_proj =nn .Linear (hidden_size ,hidden_size )
self .text_proj =nn .Linear (hidden_size ,hidden_size )
self .cross_attn =nn .MultiheadAttention (
embed_dim =hidden_size ,
num_heads =num_heads ,
batch_first =True ,
dropout =0.1 ,
)
self .text_norm =nn .LayerNorm (hidden_size )
self .video_norm =nn .LayerNorm (hidden_size )
self .alignment_head =nn .Sequential (
nn .Linear (hidden_size ,hidden_size //2 ),
nn .GELU (),
nn .Linear (hidden_size //2 ,1 ),
)
self .output_proj =nn .Linear (hidden_size ,hidden_size )
def forward (
self ,
video_features :torch .Tensor ,
text_features :torch .Tensor ,
num_frames :int ,
return_alignment_scores :bool =False ,
)->Tuple [torch .Tensor ,Optional [torch .Tensor ]]:
"""
Align text with video timestamps.
Args:
video_features: [B, T*H*W, hidden_size] video features
text_features: [B, text_len, hidden_size] text features
num_frames: Number of frames in the video
return_alignment_scores: Whether to return alignment scores for loss
Returns:
aligned_text: [B, text_len, hidden_size] text features aligned to video timestamps
alignment_scores: Optional [B, text_len, T] alignment scores
"""
batch_size =video_features .shape [0 ]
total_tokens =video_features .shape [1 ]
spatial_tokens =total_tokens //num_frames
timestamp_ids =torch .arange (num_frames ,device =video_features .device )
timestamp_embeds =self .timestamp_embedding (timestamp_ids )
timestamp_embeds =timestamp_embeds .unsqueeze (1 ).expand (-1 ,spatial_tokens ,-1 )
timestamp_embeds =timestamp_embeds .reshape (1 ,total_tokens ,-1 )
timestamp_embeds =timestamp_embeds .expand (batch_size ,-1 ,-1 )
video_feat =self .video_norm (self .video_proj (video_features )+timestamp_embeds )
text_feat =self .text_norm (self .text_proj (text_features ))
aligned ,attn_weights =self .cross_attn (text_feat ,video_feat ,video_feat )
alignment_scores =None
if return_alignment_scores :
attn_reshaped =attn_weights .view (batch_size ,text_features .shape [1 ],num_frames ,spatial_tokens )
alignment_scores =attn_reshaped .mean (dim =-1 )
aligned_text =text_features +self .output_proj (aligned )
return aligned_text ,alignment_scores
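# ---------------------------------------------------------------------------
# Hedged shape sketch for TextTimestampAlignment (toy sizes, never called at
# import time): 8 frames x 16 spatial tokens of video features are aligned
# with 12 text tokens; the returned scores average attention over the spatial
# tokens of each frame.
def _example_text_timestamp_alignment():
    align = TextTimestampAlignment(hidden_size=64, max_frames=8)
    video = torch.randn(2, 8 * 16, 64)
    text = torch.randn(2, 12, 64)
    aligned, scores = align(video, text, num_frames=8, return_alignment_scores=True)
    assert aligned.shape == (2, 12, 64)             # text features, timestamp-aligned
    assert scores.shape == (2, 12, 8)               # per text token, per frame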
class AlphaBlender (nn .Module ):
"""
AlphaBlender operator from VidTok for temporal blending.
Blends two inputs with a fixed mixing coefficient: out = alpha * x1 + (1 - alpha) * x2.
"""
def __init__ (self ,alpha :float =0.55 ):
super ().__init__ ()
self .alpha =alpha
def forward (self ,x1 :torch .Tensor ,x2 :torch .Tensor )->torch .Tensor :
return self .alpha *x1 +(1 -self .alpha )*x2
class VidTokEncoder (nn .Module ):
"""
VidTok-style Video Encoder following Microsoft's VidTok architecture.
SOTA: Implements the VidTok encoder with:
- 3D convolutions for input and bottleneck (information fusion)
- 2D convolutions for spatial downsampling (efficiency)
- AlphaBlender + 1D convolutions for temporal downsampling
- Layer normalization for stability
Compresses video [B, C, T, H, W] -> latent [B, latent_dim, t, h, w]
"""
def __init__ (
self ,
in_channels :int =3 ,
latent_channels :int =4 ,
base_channels :int =64 ,
temporal_downsample :int =4 ,
spatial_downsample :int =8 ,
causal :bool =True ,
):
super ().__init__ ()
self .in_channels =in_channels
self .latent_channels =latent_channels
self .base_channels =base_channels
self .temporal_downsample =temporal_downsample
self .spatial_downsample =spatial_downsample
self .causal =causal
self .num_spatial_downs =int (math .log2 (spatial_downsample ))
self .num_temporal_downs =int (math .log2 (temporal_downsample ))
self .input_block =nn .Sequential (
nn .Conv3d (in_channels ,base_channels ,kernel_size =3 ,padding =1 ),
nn .GroupNorm (8 ,base_channels ),
nn .SiLU (),
)
self .spatial_down_blocks =nn .ModuleList ()
ch =base_channels
for i in range (self .num_spatial_downs ):
out_ch =min (ch *2 ,512 )
self .spatial_down_blocks .append (
self ._make_spatial_down_block (ch ,out_ch )
)
ch =out_ch
self .temporal_down_blocks =nn .ModuleList ()
for i in range (self .num_temporal_downs ):
self .temporal_down_blocks .append (
self ._make_temporal_down_block (ch )
)
self .bottleneck =nn .Sequential (
nn .Conv3d (ch ,ch ,kernel_size =3 ,padding =1 ),
nn .GroupNorm (8 ,ch ),
nn .SiLU (),
nn .Conv3d (ch ,ch ,kernel_size =3 ,padding =1 ),
nn .GroupNorm (8 ,ch ),
nn .SiLU (),
)
self .to_latent =nn .Conv3d (ch ,latent_channels ,kernel_size =1 )
print (f" 🎬 VidTokEncoder: {in_channels }ch -> {latent_channels }ch latent")
print (f" Spatial: {spatial_downsample }x down ({self .num_spatial_downs } stages)")
print (f" Temporal: {temporal_downsample }x down ({self .num_temporal_downs } stages)")
def _make_spatial_down_block (self ,in_ch :int ,out_ch :int )->nn .Module :
"""Create a spatial downsampling block using 2D convolutions."""
return nn .Sequential (
Rearrange3Dto2D (),
nn .Conv2d (in_ch ,out_ch ,kernel_size =3 ,stride =2 ,padding =1 ),
nn .GroupNorm (8 ,out_ch ),
nn .SiLU (),
nn .Conv2d (out_ch ,out_ch ,kernel_size =3 ,padding =1 ),
nn .GroupNorm (8 ,out_ch ),
nn .SiLU (),
Rearrange2Dto3D (),
)
def _make_temporal_down_block (self ,channels :int )->nn .Module :
"""Create a temporal downsampling block using AlphaBlender + 1D conv."""
return TemporalDownBlock (channels ,causal =self .causal )
def forward (self ,x :torch .Tensor )->torch .Tensor :
"""
Encode video to latent space.
Args:
x: [B, C, T, H, W] input video
Returns:
[B, latent_channels, t, h, w] latent representation
"""
B ,C ,T ,H ,W =x .shape
x =self .input_block (x )
for block in self .spatial_down_blocks :
if hasattr (block [0 ],'set_temporal_dim'):
block [0 ].set_temporal_dim (x .shape [2 ])
if hasattr (block [-1 ],'set_temporal_dim'):
block [-1 ].set_temporal_dim (x .shape [2 ])
x =block (x )
for block in self .temporal_down_blocks :
x =block (x )
x =self .bottleneck (x )
x =self .to_latent (x )
return x
class VidTokDecoder (nn .Module ):
"""
VidTok-style Video Decoder following Microsoft's VidTok architecture.
Reconstructs video from latent [B, latent_dim, t, h, w] -> [B, C, T, H, W]
"""
def __init__ (
self ,
out_channels :int =3 ,
latent_channels :int =4 ,
base_channels :int =64 ,
temporal_upsample :int =4 ,
spatial_upsample :int =8 ,
causal :bool =True ,
):
super ().__init__ ()
self .out_channels =out_channels
self .latent_channels =latent_channels
self .base_channels =base_channels
self .temporal_upsample =temporal_upsample
self .spatial_upsample =spatial_upsample
self .causal =causal
self .num_spatial_ups =int (math .log2 (spatial_upsample ))
self .num_temporal_ups =int (math .log2 (temporal_upsample ))
ch =min (base_channels *(2 **self .num_spatial_ups ),512 )
self .from_latent =nn .Conv3d (latent_channels ,ch ,kernel_size =1 )
self .bottleneck =nn .Sequential (
nn .Conv3d (ch ,ch ,kernel_size =3 ,padding =1 ),
nn .GroupNorm (8 ,ch ),
nn .SiLU (),
nn .Conv3d (ch ,ch ,kernel_size =3 ,padding =1 ),
nn .GroupNorm (8 ,ch ),
nn .SiLU (),
)
self .temporal_up_blocks =nn .ModuleList ()
for i in range (self .num_temporal_ups ):
self .temporal_up_blocks .append (
TemporalUpBlock (ch ,causal =self .causal )
)
self .spatial_up_blocks =nn .ModuleList ()
for i in range (self .num_spatial_ups ):
out_ch =max (ch //2 ,base_channels )
self .spatial_up_blocks .append (
self ._make_spatial_up_block (ch ,out_ch )
)
ch =out_ch
self .output_block =nn .Sequential (
nn .Conv3d (ch ,out_channels ,kernel_size =3 ,padding =1 ),
nn .Tanh (),
)
print (f" 🎬 VidTokDecoder: {latent_channels }ch latent -> {out_channels }ch")
def _make_spatial_up_block (self ,in_ch :int ,out_ch :int )->nn .Module :
"""Create a spatial upsampling block using 2D convolutions."""
return nn .Sequential (
Rearrange3Dto2D (),
nn .ConvTranspose2d (in_ch ,out_ch ,kernel_size =4 ,stride =2 ,padding =1 ),
nn .GroupNorm (8 ,out_ch ),
nn .SiLU (),
nn .Conv2d (out_ch ,out_ch ,kernel_size =3 ,padding =1 ),
nn .GroupNorm (8 ,out_ch ),
nn .SiLU (),
Rearrange2Dto3D (),
)
def forward (self ,z :torch .Tensor )->torch .Tensor :
"""
Decode latent to video.
Args:
z: [B, latent_channels, t, h, w] latent representation
Returns:
[B, C, T, H, W] reconstructed video
"""
x =self .from_latent (z )
x =self .bottleneck (x )
for block in self .temporal_up_blocks :
x =block (x )
for block in self.spatial_up_blocks:
    # Mirror the encoder path: the trailing Rearrange2Dto3D needs the current temporal length.
    block[-1].set_temporal_dim(x.shape[2])
    x = block(x)
x =self .output_block (x )
return x
class Rearrange3Dto2D (nn .Module ):
"""Reshape [B, C, T, H, W] -> [B*T, C, H, W] for 2D operations."""
def __init__ (self ):
super ().__init__ ()
self .temporal_dim =None
def set_temporal_dim (self ,t :int ):
self .temporal_dim =t
def forward (self ,x :torch .Tensor )->torch .Tensor :
B ,C ,T ,H ,W =x .shape
self .temporal_dim =T
return x .permute (0 ,2 ,1 ,3 ,4 ).reshape (B *T ,C ,H ,W )
class Rearrange2Dto3D (nn .Module ):
"""Reshape [B*T, C, H, W] -> [B, C, T, H, W] after 2D operations."""
def __init__ (self ):
super ().__init__ ()
self .temporal_dim =None
def set_temporal_dim (self ,t :int ):
self .temporal_dim =t
def forward (self ,x :torch .Tensor )->torch .Tensor :
BT ,C ,H ,W =x .shape
T =self .temporal_dim if self .temporal_dim else 1
B =BT //T
return x .reshape (B ,T ,C ,H ,W ).permute (0 ,2 ,1 ,3 ,4 )
class TemporalDownBlock (nn .Module ):
"""Temporal downsampling using AlphaBlender + 1D conv (VidTok style)."""
def __init__ (self ,channels :int ,causal :bool =True ):
super ().__init__ ()
self .channels =channels
self .causal =causal
self .alpha_blender =AlphaBlender ()
# Stride-2 conv halves T; no extra padding is applied (kernel_size=2, stride=2).
self.temporal_conv = nn.Conv1d(channels, channels, kernel_size=2, stride=2, padding=0)
self .norm =nn .GroupNorm (8 ,channels )
self .act =nn .SiLU ()
def forward (self ,x :torch .Tensor )->torch .Tensor :
"""
Args:
x: [B, C, T, H, W]
Returns:
[B, C, T//2, H, W]
"""
B ,C ,T ,H ,W =x .shape
x =x .permute (0 ,3 ,4 ,1 ,2 ).reshape (B *H *W ,C ,T )
x =self .temporal_conv (x )
x =self .norm (x .unsqueeze (-1 )).squeeze (-1 )
x =self .act (x )
T_new =x .shape [2 ]
x =x .reshape (B ,H ,W ,C ,T_new ).permute (0 ,3 ,4 ,1 ,2 )
return x
class TemporalUpBlock (nn .Module ):
"""Temporal upsampling using AlphaBlender + 1D conv (VidTok style)."""
def __init__ (self ,channels :int ,causal :bool =True ):
super ().__init__ ()
self .channels =channels
self .causal =causal
self .alpha_blender =AlphaBlender ()
self .temporal_conv =nn .ConvTranspose1d (channels ,channels ,kernel_size =2 ,stride =2 )
self .norm =nn .GroupNorm (8 ,channels )
self .act =nn .SiLU ()
def forward (self ,x :torch .Tensor )->torch .Tensor :
"""
Args:
x: [B, C, T, H, W]
Returns:
[B, C, T*2, H, W]
"""
B ,C ,T ,H ,W =x .shape
x =x .permute (0 ,3 ,4 ,1 ,2 ).reshape (B *H *W ,C ,T )
x =self .temporal_conv (x )
x =self .norm (x .unsqueeze (-1 )).squeeze (-1 )
x =self .act (x )
T_new =x .shape [2 ]
x =x .reshape (B ,H ,W ,C ,T_new ).permute (0 ,3 ,4 ,1 ,2 )
return x
class VidTokTokenizer (nn .Module ):
"""
VidTok-style Video Tokenizer (3D VAE) following Microsoft's VidTok architecture.
SOTA: Full encoder-decoder architecture for video compression to latent space.
- Efficient 2D+1D architecture (separates spatial and temporal processing)
- AlphaBlender for temporal blending
- Supports both continuous (KL) and discrete (FSQ) tokenization
- Causal mode for streaming/autoregressive applications
Compresses video [B, C, T, H, W] -> latent [B, latent_dim, t, h, w]
"""
def __init__ (
self ,
in_channels :int =3 ,
latent_channels :int =4 ,
base_channels :int =64 ,
temporal_compression :int =4 ,
spatial_compression :int =8 ,
causal :bool =True ,
use_fsq :bool =False ,
fsq_levels :int =8 ,
):
super ().__init__ ()
self .in_channels =in_channels
self .latent_channels =latent_channels
self .temporal_compression =temporal_compression
self .spatial_compression =spatial_compression
self .causal =causal
self .use_fsq =use_fsq
self .fsq_levels =fsq_levels
self .encoder =VidTokEncoder (
in_channels =in_channels ,
latent_channels =latent_channels *2 if not use_fsq else latent_channels ,
base_channels =base_channels ,
temporal_downsample =temporal_compression ,
spatial_downsample =spatial_compression ,
causal =causal ,
)
self .decoder =VidTokDecoder (
out_channels =in_channels ,
latent_channels =latent_channels ,
base_channels =base_channels ,
temporal_upsample =temporal_compression ,
spatial_upsample =spatial_compression ,
causal =causal ,
)
print (f" 🎬 VidTokTokenizer: {temporal_compression }x{spatial_compression }x{spatial_compression } compression")
print (f" Mode: {'FSQ (discrete)'if use_fsq else 'KL (continuous)'}, Causal: {causal }")
def encode (self ,x :torch .Tensor )->torch .Tensor :
"""Encode video to latent space."""
h =self .encoder (x )
if self .use_fsq :
return self ._fsq_quantize (h )
else :
mean ,logvar =h .chunk (2 ,dim =1 )
std =torch .exp (0.5 *logvar )
eps =torch .randn_like (std )
return mean +eps *std
def decode (self ,z :torch .Tensor )->torch .Tensor :
"""Decode latent to video."""
return self .decoder (z )
def _fsq_quantize (self ,z :torch .Tensor )->torch .Tensor :
"""Finite Scalar Quantization - quantize each channel independently."""
z =torch .tanh (z )
z =torch .round ((z +1 )*(self .fsq_levels -1 )/2 )*2 /(self .fsq_levels -1 )-1
return z
def forward (self ,x :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ]:
"""
Full forward pass: encode then decode.
Args:
x: [B, C, T, H, W] input video
Returns:
Tuple of (reconstructed video, latent representation)
"""
z =self .encode (x )
x_recon =self .decode (z )
return x_recon ,z
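# ---------------------------------------------------------------------------
# Hedged round-trip sketch for VidTokTokenizer (toy sizes, never called at
# import time): an 8-frame 64x64 clip is compressed 4x temporally and 8x
# spatially, then reconstructed to the original shape.
def _example_vidtok_roundtrip():
    vae = VidTokTokenizer(in_channels=3, latent_channels=4, base_channels=32,
                          temporal_compression=4, spatial_compression=8)
    video = torch.randn(1, 3, 8, 64, 64)            # [B, C, T, H, W]
    recon, z = vae(video)
    assert z.shape == (1, 4, 2, 8, 8)               # T/4, H/8, W/8
    assert recon.shape == video.shape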
class RoPE3DEncoder (nn .Module ):
"""
3D Rotary Position Embedding for (x, y, t) dimensions.
Matches the 3D-RoPE in video generator for seamless integration.
"""
def __init__ (self ,dim :int ,max_height :int =64 ,max_width :int =64 ,max_frames :int =32 ,base :float =10000.0 ):
super ().__init__ ()
self .dim =dim
self .max_height =max_height
self .max_width =max_width
self .max_frames =max_frames
self .base =base
dim_per_axis =dim //3
self .dim_x =dim_per_axis
self .dim_y =dim_per_axis
self .dim_t =dim -2 *dim_per_axis
inv_freq_x =1.0 /(base **(torch .arange (0 ,self .dim_x ,2 ,dtype =torch .float32 )/self .dim_x ))
inv_freq_y =1.0 /(base **(torch .arange (0 ,self .dim_y ,2 ,dtype =torch .float32 )/self .dim_y ))
inv_freq_t =1.0 /(base **(torch .arange (0 ,self .dim_t ,2 ,dtype =torch .float32 )/self .dim_t ))
self .register_buffer ('inv_freq_x',inv_freq_x ,persistent =False )
self .register_buffer ('inv_freq_y',inv_freq_y ,persistent =False )
self .register_buffer ('inv_freq_t',inv_freq_t ,persistent =False )
def forward (self ,x :torch .Tensor ,height :int ,width :int ,frames :int )->Tuple [torch .Tensor ,torch .Tensor ]:
device =x .device
dtype =x .dtype
pos_x =torch .arange (width ,device =device ,dtype =torch .float32 )
pos_y =torch .arange (height ,device =device ,dtype =torch .float32 )
pos_t =torch .arange (frames ,device =device ,dtype =torch .float32 )
freqs_x =torch .outer (pos_x ,self .inv_freq_x .to (device ))
freqs_y =torch .outer (pos_y ,self .inv_freq_y .to (device ))
freqs_t =torch .outer (pos_t ,self .inv_freq_t .to (device ))
freqs_x =torch .cat ([freqs_x ,freqs_x ],dim =-1 )
freqs_y =torch .cat ([freqs_y ,freqs_y ],dim =-1 )
freqs_t =torch .cat ([freqs_t ,freqs_t ],dim =-1 )
cos_x =freqs_x .cos ().to (dtype )
sin_x =freqs_x .sin ().to (dtype )
cos_y =freqs_y .cos ().to (dtype )
sin_y =freqs_y .sin ().to (dtype )
cos_t =freqs_t .cos ().to (dtype )
sin_t =freqs_t .sin ().to (dtype )
cos_3d =torch .zeros (frames ,height ,width ,self .dim ,device =device ,dtype =dtype )
sin_3d =torch .zeros (frames ,height ,width ,self .dim ,device =device ,dtype =dtype )
for t in range (frames ):
for y in range (height ):
for w in range (width ):
cos_3d [t ,y ,w ,:self .dim_x ]=cos_x [w ]
sin_3d [t ,y ,w ,:self .dim_x ]=sin_x [w ]
cos_3d [t ,y ,w ,self .dim_x :self .dim_x +self .dim_y ]=cos_y [y ]
sin_3d [t ,y ,w ,self .dim_x :self .dim_x +self .dim_y ]=sin_y [y ]
cos_3d [t ,y ,w ,self .dim_x +self .dim_y :]=cos_t [t ]
sin_3d [t ,y ,w ,self .dim_x +self .dim_y :]=sin_t [t ]
cos_3d =cos_3d .view (frames *height *width ,self .dim )
sin_3d =sin_3d .view (frames *height *width ,self .dim )
return cos_3d ,sin_3d
def apply_rope_3d_encoder (x :torch .Tensor ,cos :torch .Tensor ,sin :torch .Tensor )->torch .Tensor :
"""Apply 3D rotary position embedding to tensor."""
x1 =x [...,:x .shape [-1 ]//2 ]
x2 =x [...,x .shape [-1 ]//2 :]
rotated =torch .cat ((-x2 ,x1 ),dim =-1 )
return x *cos +rotated *sin
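# ---------------------------------------------------------------------------
# Worked channel split for the 3D-RoPE above: with head_dim = 48 the split is
# dim_x = dim_y = 48 // 3 = 16 and dim_t = 48 - 2*16 = 16, so the x, y and t
# axes each get their own 16-channel rotary band covering the full head dim.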
class TemporalExpertRouterEncoder (nn .Module ):
"""
Temporal-Aware Expert Router for video encoding.
Routes tokens based on temporal context and motion patterns.
"""
def __init__ (self ,hidden_size :int ,num_experts :int =4 ,top_k :int =2 ):
super ().__init__ ()
self .num_experts =num_experts
self .top_k =top_k
self .temporal_proj =nn .Linear (hidden_size ,hidden_size )
self .gate =nn .Linear (hidden_size ,num_experts ,bias =False )
nn .init .normal_ (self .gate .weight ,mean =0.0 ,std =0.01 )
def forward (self ,x :torch .Tensor ,temporal_context :Optional [torch .Tensor ]=None )->Tuple [torch .Tensor ,torch .Tensor ]:
if temporal_context is not None :
x =x +self .temporal_proj (temporal_context )
router_logits =self .gate (x )
router_probs =F .softmax (router_logits ,dim =-1 ,dtype =x .dtype )
top_k_probs ,top_k_indices =torch .topk (router_probs ,self .top_k ,dim =-1 )
top_k_probs =top_k_probs /(top_k_probs .sum (dim =-1 ,keepdim =True )+EPS )
return top_k_probs ,top_k_indices
class VideoExpertEncoder (nn .Module ):
"""Single expert for video encoding with SwiGLU."""
def __init__ (self ,hidden_size :int ,intermediate_size :int ):
super ().__init__ ()
self .gate_proj =nn .Linear (hidden_size ,intermediate_size ,bias =False )
self .up_proj =nn .Linear (hidden_size ,intermediate_size ,bias =False )
self .down_proj =nn .Linear (intermediate_size ,hidden_size ,bias =False )
self .act_fn =nn .SiLU ()
def forward (self ,x :torch .Tensor )->torch .Tensor :
return self .down_proj (self .act_fn (self .gate_proj (x ))*self .up_proj (x ))
class TemporalMoELayerEncoder (nn .Module ):
"""
Temporal-Aware MoE Layer for video encoding.
Uses motion-aware routing for expert selection.
"""
def __init__ (self ,hidden_size :int ,intermediate_size :int ,num_experts :int =4 ,top_k :int =2 ):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_experts =num_experts
self .top_k =top_k
self .router =TemporalExpertRouterEncoder (hidden_size ,num_experts ,top_k )
self .experts =nn .ModuleList ([
VideoExpertEncoder (hidden_size ,intermediate_size )
for _ in range (num_experts )
])
self .shared_expert =VideoExpertEncoder (hidden_size ,intermediate_size )
def forward (self ,x :torch .Tensor ,temporal_context :Optional [torch .Tensor ]=None )->torch .Tensor :
batch_size ,seq_len ,hidden_size =x .shape
x_flat =x .view (-1 ,hidden_size )
top_k_probs ,top_k_indices =self .router (x_flat ,temporal_context .view (-1 ,hidden_size )if temporal_context is not None else None )
output =torch .zeros_like (x_flat )
for expert_idx in range (self .num_experts ):
expert =self .experts [expert_idx ]
for k in range (self .top_k ):
mask =(top_k_indices [:,k ]==expert_idx )
if mask .any ():
expert_input =x_flat [mask ]
expert_output =expert (expert_input )
weight =top_k_probs [mask ,k :k +1 ]
output [mask ]=output [mask ]+weight *expert_output
shared_output =self .shared_expert (x_flat )
output =output +shared_output
return output .view (batch_size ,seq_len ,hidden_size )
class Causal3DAttentionEncoder (nn .Module ):
"""
3D Causal Self-Attention with 3D-RoPE for video encoding.
Attends to all positions for encoding (non-causal during encoding).
"""
def __init__ (self ,hidden_size :int ,num_heads :int =8 ,max_frames :int =32 ,max_height :int =64 ,max_width :int =64 ):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_heads =num_heads
self .head_dim =hidden_size //num_heads
self .scale =self .head_dim **-0.5
self .to_qkv =nn .Linear (hidden_size ,hidden_size *3 ,bias =False )
self .to_out =nn .Linear (hidden_size ,hidden_size ,bias =False )
self .norm =nn .LayerNorm (hidden_size )
self .rope_3d =RoPE3DEncoder (self .head_dim ,max_height ,max_width ,max_frames )
def forward (self ,x :torch .Tensor ,height :int ,width :int ,frames :int ,causal :bool =False )->torch .Tensor :
batch_size ,seq_len ,_ =x .shape
x_norm =self .norm (x )
qkv =self .to_qkv (x_norm ).reshape (batch_size ,seq_len ,3 ,self .num_heads ,self .head_dim )
q ,k ,v =qkv .unbind (dim =2 )
cos ,sin =self .rope_3d (x ,height ,width ,frames )
# Broadcast over batch and heads: [seq_len, head_dim] -> [1, 1, seq_len, head_dim]
cos = cos.unsqueeze(0).unsqueeze(0)
sin = sin.unsqueeze(0).unsqueeze(0)
q =q .transpose (1 ,2 )
k =k .transpose (1 ,2 )
v =v .transpose (1 ,2 )
q =apply_rope_3d_encoder (q ,cos ,sin )
k =apply_rope_3d_encoder (k ,cos ,sin )
if causal :
attn_output =F .scaled_dot_product_attention (q ,k ,v ,is_causal =True )
else :
attn_output =F .scaled_dot_product_attention (q ,k ,v )
attn_output =attn_output .transpose (1 ,2 ).reshape (batch_size ,seq_len ,self .hidden_size )
return self .to_out (attn_output )
class VideoEncoderBlock (nn .Module ):
"""Single block with 3D causal attention and temporal MoE FFN."""
def __init__ (
self ,
hidden_size :int ,
num_heads :int =8 ,
num_experts :int =4 ,
max_frames :int =32 ,
max_height :int =64 ,
max_width :int =64 ,
):
super ().__init__ ()
self .attn =Causal3DAttentionEncoder (hidden_size ,num_heads ,max_frames ,max_height ,max_width )
self .moe =TemporalMoELayerEncoder (hidden_size ,hidden_size *4 ,num_experts )
self .norm =nn .LayerNorm (hidden_size )
def forward (self ,x :torch .Tensor ,height :int ,width :int ,frames :int ,causal :bool =False )->torch .Tensor :
x =x +self .attn (x ,height ,width ,frames ,causal )
x =self .norm (x +self .moe (x ))
return x
class VideoTiTokTokenizer (nn .Module ):
"""
SOTA TiTok-style 1D Tokenizer for video features with temporal awareness.
This compresses encoded video features (from vision encoder) to a smaller
number of tokens, similar to how TiTokTokenizer works for images but with
proper temporal modeling.
SOTA Features:
- Multi-layer transformer with temporal-aware attention
- 3D positional encoding (spatial + temporal)
- Hierarchical compression: spatial first, then temporal
- Causal temporal attention for streaming compatibility
- Gated cross-attention for selective feature extraction
Note: This is different from VidTokTokenizer which is a 3D VAE for raw video compression.
This tokenizer operates on already-encoded features, not raw pixels.
Converts [B, T*H*W, hidden_size] -> [B, num_tokens, hidden_size]
"""
def __init__ (
self ,
hidden_size :int ,
num_tokens :int =64 ,
num_patches :int =576 ,
max_frames :int =32 ,
num_layers :int =2 ,
num_heads :int =8 ,
dropout :float =0.1 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_tokens =num_tokens
self .num_patches =num_patches
self .max_frames =max_frames
self .num_heads =num_heads
self .patches_per_frame =num_patches //max_frames if max_frames >0 else num_patches
self .spatial_size =int (self .patches_per_frame **0.5 )
self .temporal_pos =nn .Parameter (torch .randn (1 ,max_frames ,1 ,hidden_size )*0.02 )
self .spatial_pos =nn .Parameter (torch .randn (1 ,1 ,self .patches_per_frame ,hidden_size )*0.02 )
self .input_norm =nn .LayerNorm (hidden_size )
self .input_proj =nn .Linear (hidden_size ,hidden_size )
self .num_temporal_tokens =min (num_tokens //4 ,max_frames )
self .num_content_tokens =num_tokens -self .num_temporal_tokens
self .temporal_queries =nn .Parameter (torch .randn (1 ,self .num_temporal_tokens ,hidden_size )*0.02 )
self .content_queries =nn .Parameter (torch .randn (1 ,self .num_content_tokens ,hidden_size )*0.02 )
self .compress_layers =nn .ModuleList ()
for i in range (num_layers ):
self .compress_layers .append (nn .ModuleDict ({
'cross_attn':nn .MultiheadAttention (
embed_dim =hidden_size ,
num_heads =num_heads ,
batch_first =True ,
dropout =dropout ,
),
'cross_gate':nn .Sequential (
nn .Linear (hidden_size ,hidden_size ),
nn .Sigmoid (),
),
'cross_norm':nn .LayerNorm (hidden_size ),
'self_attn':nn .MultiheadAttention (
embed_dim =hidden_size ,
num_heads =num_heads ,
batch_first =True ,
dropout =dropout ,
),
'self_norm':nn .LayerNorm (hidden_size ),
'ffn':nn .Sequential (
nn .Linear (hidden_size ,hidden_size *4 ),
nn .GELU (),
nn .Dropout (dropout ),
nn .Linear (hidden_size *4 ,hidden_size ),
nn .Dropout (dropout ),
),
'ffn_norm':nn .LayerNorm (hidden_size ),
}))
self .fusion_attn =nn .MultiheadAttention (
embed_dim =hidden_size ,
num_heads =num_heads ,
batch_first =True ,
dropout =dropout ,
)
self .fusion_norm =nn .LayerNorm (hidden_size )
self .output_proj =nn .Sequential (
nn .Linear (hidden_size ,hidden_size ),
nn .GELU (),
nn .Linear (hidden_size ,hidden_size ),
)
self .output_norm =nn .LayerNorm (hidden_size )
print (f" 🎬 VideoTiTokTokenizer: {num_patches } patches -> {num_tokens } tokens")
print (f" Temporal tokens: {self .num_temporal_tokens }, Content tokens: {self .num_content_tokens }")
print (f" Layers: {num_layers }, Heads: {num_heads }")
def _load_from_state_dict (self ,state_dict ,prefix ,local_metadata ,strict ,missing_keys ,unexpected_keys ,error_msgs ):
"""Production-grade hook to handle dynamic frame counts and token counts when loading checkpoints."""
t_pos_key =prefix +'temporal_pos'
if t_pos_key in state_dict :
ckpt_pos =state_dict [t_pos_key ]
if ckpt_pos .shape !=self .temporal_pos .shape :
print (f" ⚠️ VideoTiTokTokenizer: Interpolating {t_pos_key } from {ckpt_pos .shape [1 ]} to {self .max_frames } frames.")
ckpt_pos =ckpt_pos .squeeze (2 ).transpose (1 ,2 )
resized =F .interpolate (ckpt_pos ,size =self .max_frames ,mode ='linear',align_corners =False )
state_dict [t_pos_key ]=resized .transpose (1 ,2 ).unsqueeze (2 )
t_query_key =prefix +'temporal_queries'
if t_query_key in state_dict :
ckpt_query =state_dict [t_query_key ]
if ckpt_query .shape !=self .temporal_queries .shape :
print (f" ⚠️ VideoTiTokTokenizer: Interpolating {t_query_key } from {ckpt_query .shape [1 ]} to {self .num_temporal_tokens } tokens.")
ckpt_query =ckpt_query .transpose (1 ,2 )
resized =F .interpolate (ckpt_query ,size =self .num_temporal_tokens ,mode ='linear',align_corners =False )
state_dict [t_query_key ]=resized .transpose (1 ,2 )
c_query_key =prefix +'content_queries'
if c_query_key in state_dict :
ckpt_query =state_dict [c_query_key ]
if ckpt_query .shape !=self .content_queries .shape :
print (f" ⚠️ VideoTiTokTokenizer: Interpolating {c_query_key } from {ckpt_query .shape [1 ]} to {self .num_content_tokens } tokens.")
ckpt_query =ckpt_query .transpose (1 ,2 )
resized =F .interpolate (ckpt_query ,size =self .num_content_tokens ,mode ='linear',align_corners =False )
state_dict [c_query_key ]=resized .transpose (1 ,2 )
super ()._load_from_state_dict (state_dict ,prefix ,local_metadata ,strict ,missing_keys ,unexpected_keys ,error_msgs )
def _add_3d_pos_encoding (self ,x :torch .Tensor ,num_frames :int ,patches_per_frame :int )->torch .Tensor :
"""Add 3D positional encoding (temporal + spatial)."""
B ,seq_len ,D =x .shape
x =x .reshape (B ,num_frames ,patches_per_frame ,D )
temporal_pos =self .temporal_pos [:,:num_frames ,:,:]
x =x +temporal_pos
spatial_pos =self .spatial_pos [:,:,:patches_per_frame ,:]
x =x +spatial_pos
return x .reshape (B ,seq_len ,D )
def forward(self, x: torch.Tensor, num_frames: Optional[int] = None) -> torch.Tensor:
"""
Compress video patch features to TiTok-style 1D tokens.
Args:
x: [B, T*H*W, hidden_size] video patch features (flattened spatial-temporal)
or [B, T, H*W, hidden_size] video patch features per frame
num_frames: Number of frames (optional, for temporal embedding)
Returns:
[B, num_tokens, hidden_size] compressed token features
"""
batch_size =x .shape [0 ]
if x .dim ()==4 :
B ,T ,HW ,D =x .shape
x =x .reshape (B ,T *HW ,D )
num_frames =T
patches_per_frame =HW
else :
seq_len =x .shape [1 ]
if num_frames is None :
num_frames =min (self .max_frames ,seq_len //self .patches_per_frame )
num_frames =max (1 ,num_frames )
patches_per_frame =seq_len //num_frames if num_frames >0 else seq_len
x =self .input_norm (x )
x =self .input_proj (x )
x =self ._add_3d_pos_encoding (x ,num_frames ,patches_per_frame )
temporal_queries =self .temporal_queries [:,:min (self .num_temporal_tokens ,num_frames ),:].expand (batch_size ,-1 ,-1 )
content_queries =self .content_queries .expand (batch_size ,-1 ,-1 )
queries =torch .cat ([temporal_queries ,content_queries ],dim =1 )
for layer in self .compress_layers :
cross_out ,_ =layer ['cross_attn'](queries ,x ,x )
gate =layer ['cross_gate'](queries )
queries =layer ['cross_norm'](queries +gate *cross_out )
self_out ,_ =layer ['self_attn'](queries ,queries ,queries )
queries =layer ['self_norm'](queries +self_out )
ffn_out =layer ['ffn'](queries )
queries =layer ['ffn_norm'](queries +ffn_out )
actual_temporal =temporal_queries .shape [1 ]
temporal_tokens =queries [:,:actual_temporal ,:]
content_tokens =queries [:,actual_temporal :,:]
fused ,_ =self .fusion_attn (content_tokens ,temporal_tokens ,temporal_tokens )
content_tokens =self .fusion_norm (content_tokens +fused )
tokens =torch .cat ([temporal_tokens ,content_tokens ],dim =1 )
if tokens .shape [1 ]<self .num_tokens :
pad_size =self .num_tokens -tokens .shape [1 ]
pad_tokens =self .content_queries [:,:pad_size ,:].expand (batch_size ,-1 ,-1 )
tokens =torch .cat ([tokens ,pad_tokens ],dim =1 )
elif tokens .shape [1 ]>self .num_tokens :
tokens =tokens [:,:self .num_tokens ,:]
tokens =self .output_proj (tokens )
tokens =self .output_norm (tokens )
return tokens
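# ---------------------------------------------------------------------------
# Hedged usage sketch for VideoTiTokTokenizer (toy sizes, never called at
# import time): 8 frames x 16 spatial patches of encoded features are
# compressed to 64 tokens (8 temporal + 56 content queries).
def _example_video_titok():
    tok = VideoTiTokTokenizer(hidden_size=64, num_tokens=64,
                              num_patches=8 * 16, max_frames=8)
    feats = torch.randn(2, 8 * 16, 64)
    tokens = tok(feats, num_frames=8)
    assert tokens.shape == (2, 64, 64)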
class VideoEncoder (nn .Module ):
"""
SOTA Video Encoder with 3D-RoPE, 3D Causal Attention, Temporal Expert Routing, and VidTokTokenizer.
Features:
- 3D-RoPE for flexible (x, y, t) positional encodings
- 3D Causal Attention for temporal understanding
- Temporal-Aware Expert Routing for motion patterns
- VidTokTokenizer for efficient 1D token compression (mirrors TiTokTokenizer for images)
- Integrated with vision encoder backbone
- FP16-native numerical stability
"""
def __init__ (
self ,
vision_encoder :VisionEncoder ,
max_frames :int =32 ,
num_encoder_layers :int =4 ,
num_experts :int =4 ,
use_3d_rope :bool =True ,
use_temporal_moe :bool =True ,
use_video_tokenizer :bool =True ,
num_video_tokens :int =64 ,
):
super ().__init__ ()
self .vision_encoder =vision_encoder
self .max_frames =max_frames
self .hidden_size =vision_encoder .hidden_size
self .use_3d_rope =use_3d_rope
self .use_temporal_moe =use_temporal_moe
self .use_video_tokenizer =use_video_tokenizer
self .image_size =getattr (vision_encoder ,'image_size',384 )
self .patch_size =getattr (vision_encoder .vision_model .config ,'patch_size',14 )
self .patches_per_side =self .image_size //self .patch_size
self .num_spatial_tokens =self .patches_per_side **2
if use_3d_rope :
self .rope_3d =RoPE3DEncoder (
dim =self .hidden_size ,
max_height =self .patches_per_side ,
max_width =self .patches_per_side ,
max_frames =max_frames ,
)
print (f" 📐 3D-RoPE: (x,y,t) position encoding")
else :
self .rope_3d =None
self .encoder_blocks =nn .ModuleList ([
VideoEncoderBlock (
hidden_size =self .hidden_size ,
num_heads =8 ,
num_experts =num_experts if use_temporal_moe else 1 ,
max_frames =max_frames ,
max_height =self .patches_per_side ,
max_width =self .patches_per_side ,
)
for _ in range (num_encoder_layers )
])
print (f" 🎬 3D Causal Transformer: {num_encoder_layers } layers")
if use_temporal_moe :
print (f" 🎯 Temporal MoE: {num_experts } experts per layer")
if use_video_tokenizer :
self .vidtok =VideoTiTokTokenizer (
hidden_size =self .hidden_size ,
num_tokens =num_video_tokens ,
num_patches =self .num_spatial_tokens *max_frames ,
max_frames =max_frames ,
)
self .video_tokenizer =self .vidtok
else :
self .vidtok =None
self .video_tokenizer =None
self .temporal_pool_query =nn .Parameter (torch .randn (1 ,1 ,self .hidden_size )*0.02 )
self .temporal_pool_attn =nn .MultiheadAttention (
embed_dim =self .hidden_size ,
num_heads =8 ,
batch_first =True ,
dropout =0.1 ,
)
self .temporal_pool_norm =nn .LayerNorm (self .hidden_size )
self .frame_pos_embed =nn .Parameter (torch .randn (1 ,max_frames ,self .hidden_size )*0.02 )
print (f" 🎬 Video encoder: max {max_frames } frames (multi-scale enabled)")
def _load_from_state_dict (self ,state_dict ,prefix ,local_metadata ,strict ,missing_keys ,unexpected_keys ,error_msgs ):
"""Production-grade hook to handle dynamic frame counts when loading checkpoints.
Interpolates temporal embeddings if the checkpoint frames differ from max_frames.
"""
embed_key =prefix +'frame_pos_embed'
if embed_key in state_dict :
ckpt_embed =state_dict [embed_key ]
if ckpt_embed .shape !=self .frame_pos_embed .shape :
print (f" ⚠️ VideoEncoder: Interpolating {embed_key } from {ckpt_embed .shape [1 ]} to {self .max_frames } frames.")
ckpt_embed =ckpt_embed .transpose (1 ,2 )
resized =F .interpolate (ckpt_embed ,size =self .max_frames ,mode ='linear',align_corners =False )
state_dict [embed_key ]=resized .transpose (1 ,2 )
super ()._load_from_state_dict (state_dict ,prefix ,local_metadata ,strict ,missing_keys ,unexpected_keys ,error_msgs )
def _extract_frame_features(self, frames: torch.Tensor) -> Tuple[torch.Tensor, int, int]:
"""Extract per-frame features with the vision encoder; returns (features, batch_size, num_frames)."""
batch_size ,num_frames =frames .shape [:2 ]
frames_flat =frames .view (-1 ,*frames .shape [2 :])
if frames_flat .shape [-1 ]!=self .image_size or frames_flat .shape [-2 ]!=self .image_size :
frames_flat =F .interpolate (
frames_flat ,
size =(self .image_size ,self .image_size ),
mode ='bilinear',
align_corners =False
)
if not any (p .requires_grad for p in self .vision_encoder .parameters ()):
with torch .no_grad ():
frame_features =self .vision_encoder (frames_flat ,return_titok =False )
else :
frame_features =self .vision_encoder (frames_flat ,return_titok =False )
return frame_features ,batch_size ,num_frames
def forward (
self ,
frames :torch .Tensor ,
return_all_frames :bool =False ,
causal :bool =False ,
return_tokens :bool =False ,
)->torch .Tensor :
"""
Process video frames with 3D-RoPE and Causal Attention.
Args:
frames: [B, T, C, H, W] tensor of video frames
return_all_frames: If True, return all frame features; else return pooled
causal: If True, use causal attention (for autoregressive)
return_tokens: If True, return VideoTokenizer compressed tokens
Returns:
If return_tokens: [B, num_tokens, hidden_size] video tokens
If return_all_frames: [B, T, hidden_size] per-frame features
Else: [B, hidden_size] pooled video representation
"""
frame_features ,batch_size ,num_frames =self ._extract_frame_features (frames )
_ ,num_patches ,hidden_size =frame_features .shape
height =width =int (math .sqrt (num_patches ))
frame_features =frame_features .view (batch_size ,num_frames ,num_patches ,hidden_size )
frame_features =frame_features +self .frame_pos_embed [:,:num_frames ].unsqueeze (2 )
x =frame_features .view (batch_size ,num_frames *num_patches ,hidden_size )
for block in self .encoder_blocks :
x =block (x ,height ,width ,num_frames ,causal =causal )
if return_tokens and self .vidtok is not None :
return self .vidtok (x ,num_frames )
if return_all_frames :
x =x .view (batch_size ,num_frames ,num_patches ,hidden_size )
return x .mean (dim =2 )
else :
query =self .temporal_pool_query .expand (batch_size ,-1 ,-1 )
pooled ,_ =self .temporal_pool_attn (query ,x ,x )
pooled =self .temporal_pool_norm (query +pooled )
return pooled .squeeze (1 )
def encode_frames_separately (self ,frames :torch .Tensor )->torch .Tensor :
"""
Encode frames without temporal attention (for generation conditioning).
Args:
frames: [B, T, C, H, W] tensor of video frames
Returns:
[B, T, hidden_size] tensor of frame features
"""
frame_features ,batch_size ,num_frames =self ._extract_frame_features (frames )
frame_features =frame_features .mean (dim =1 )
return frame_features .view (batch_size ,num_frames ,-1 )
def encode_with_spatial (self ,frames :torch .Tensor )->torch .Tensor :
"""
Encode frames preserving spatial structure (for video generation).
Args:
frames: [B, T, C, H, W] tensor of video frames
Returns:
[B, T, H, W, hidden_size] tensor of spatio-temporal features
"""
frame_features ,batch_size ,num_frames =self ._extract_frame_features (frames )
_ ,num_patches ,hidden_size =frame_features .shape
height =width =int (math .sqrt (num_patches ))
frame_features =frame_features .view (batch_size ,num_frames ,height ,width ,hidden_size )
return frame_features
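# Usage sketch (illustrative only, not executed at import): encoding a short clip
# with VideoEncoder. The `vision_encoder` instance and all shapes below are
# assumptions for the example, not values taken from a real checkpoint.
#
#   video_encoder = VideoEncoder(vision_encoder, max_frames=16, num_video_tokens=64)
#   frames = torch.randn(2, 16, 3, 384, 384)                    # [B, T, C, H, W]
#   pooled = video_encoder(frames)                              # [B, hidden_size]
#   per_frame = video_encoder(frames, return_all_frames=True)   # [B, T, hidden_size]
#   tokens = video_encoder(frames, return_tokens=True)          # [B, 64, hidden_size]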
# ==============================================================================
# MODELS.ENCODERS.AUDIO
# ==============================================================================
EPS =1e-5
class RawWaveformTokenizer (nn .Module ):
"""
Raw Waveform Tokenizer - directly tokenizes audio waveforms without mel spectrograms.
Uses multi-scale 1D convolutions to extract features at different temporal resolutions,
then combines them into a unified representation.
"""
def __init__ (
self ,
hidden_size :int =1024 ,
num_codebooks :int =8 ,
codebook_size :int =1024 ,
sample_rate :int =16000 ,
hop_length :int =320 ,
num_conv_layers :int =6 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_codebooks =num_codebooks
self .codebook_size =codebook_size
self .sample_rate =sample_rate
self .hop_length =hop_length
self .conv_layers =nn .ModuleList ()
in_channels =1
channels =[32 ,64 ,128 ,256 ,512 ,hidden_size ]
kernel_sizes =[7 ,5 ,5 ,3 ,3 ,3 ]
strides =[2 ,2 ,2 ,2 ,2 ,2 ]
for i in range (num_conv_layers ):
out_channels =channels [i ]if i <len (channels )else hidden_size
kernel_size =kernel_sizes [i ]if i <len (kernel_sizes )else 3
stride =strides [i ]if i <len (strides )else 2
self .conv_layers .append (nn .Sequential (
nn .Conv1d (in_channels ,out_channels ,kernel_size ,stride ,kernel_size //2 ),
nn .GroupNorm (8 if out_channels >=8 else 1 ,out_channels ),
nn .SiLU (),
))
in_channels =out_channels
self .codebooks =nn .ModuleList ([
nn .Embedding (codebook_size ,hidden_size )
for _ in range (num_codebooks )
])
self .commitment_weight =0.25
self .output_proj =nn .Linear (hidden_size ,hidden_size )
print (f" 🎵 RawWaveformTokenizer: {num_codebooks } codebooks x {codebook_size } codes")
def encode (self ,waveform :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ]:
"""
Encode waveform to continuous features.
Args:
waveform: [B, T] or [B, 1, T] raw audio waveform
Returns:
features: [B, T', hidden_size] encoded features
indices: always None here; call quantize() on the returned features to obtain codebook indices
"""
if waveform .dim ()==2 :
waveform =waveform .unsqueeze (1 )
x =waveform
for conv in self .conv_layers :
x =conv (x )
x =x .transpose (1 ,2 )
return x ,None
def quantize (self ,features :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ,torch .Tensor ]:
"""
Residual Vector Quantization.
Args:
features: [B, T, hidden_size] continuous features
Returns:
quantized: [B, T, hidden_size] quantized features
indices: [B, T, num_codebooks] codebook indices
commitment_loss: scalar commitment loss
"""
batch_size ,seq_len ,_ =features .shape
residual =features
quantized =torch .zeros_like (features )
all_indices =[]
total_commitment_loss =0.0
for codebook in self .codebooks :
distances =torch .cdist (residual ,codebook .weight )
indices =distances .argmin (dim =-1 )
all_indices .append (indices )
quantized_step =codebook (indices )
quantized =quantized +residual +(quantized_step -residual ).detach ()
commitment_loss =F .mse_loss (residual .detach (),quantized_step )
total_commitment_loss =total_commitment_loss +commitment_loss
residual =residual -quantized_step .detach ()
indices =torch .stack (all_indices ,dim =-1 )
commitment_loss =total_commitment_loss *self .commitment_weight
return quantized ,indices ,commitment_loss
def forward (self ,waveform :torch .Tensor ,quantize :bool =False )->Tuple [torch .Tensor ,Optional [torch .Tensor ]]:
"""
Forward pass.
Args:
waveform: [B, T] or [B, 1, T] raw audio
quantize: Whether to apply vector quantization
Returns:
features: [B, T', hidden_size] encoded features
commitment_loss: Optional commitment loss if quantize=True
"""
features ,_ =self .encode (waveform )
if quantize :
features ,indices ,commitment_loss =self .quantize (features )
features =self .output_proj (features )
return features ,commitment_loss
features =self .output_proj (features )
return features ,None
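# Usage sketch (illustrative only): with the default six stride-2 conv layers the
# tokenizer downsamples the waveform by 2**6 = 64, so 1 s of 16 kHz audio
# (16000 samples) maps to roughly 250 feature frames. The shapes below are assumptions.
#
#   tok = RawWaveformTokenizer(hidden_size=1024)
#   wav = torch.randn(2, 16000)                          # [B, T] raw audio
#   feats, _ = tok(wav)                                  # [B, ~250, 1024]
#   q, idx, c_loss = tok.quantize(feats)                 # RVQ: idx is [B, ~250, 8]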
class SnakeActivation (nn .Module ):
"""
Snake activation function from BigVGAN.
x + (1/a) * sin^2(a * x)
Better than ReLU/SiLU for audio generation - preserves periodicity.
"""
def __init__ (self ,channels :int ,alpha :float =1.0 ):
super ().__init__ ()
self .alpha =nn .Parameter (torch .ones (1 ,channels ,1 )*alpha )
def forward (self ,x :torch .Tensor )->torch .Tensor :
return x +(1.0 /(self .alpha +1e-6 ))*torch .sin (self .alpha *x )**2
class ResidualBlock1D (nn .Module ):
"""Residual block with dilated convolutions for multi-receptive field."""
def __init__ (self ,channels :int ,kernel_size :int =3 ,dilation :int =1 ):
super ().__init__ ()
padding =(kernel_size *dilation -dilation )//2
self .conv1 =nn .utils .parametrizations .weight_norm (
nn .Conv1d (channels ,channels ,kernel_size ,padding =padding ,dilation =dilation )
)
self .conv2 =nn .utils .parametrizations .weight_norm (
nn .Conv1d (channels ,channels ,kernel_size ,padding =kernel_size //2 )
)
self .activation =SnakeActivation (channels )
def forward (self ,x :torch .Tensor )->torch .Tensor :
residual =x
x =self .activation (self .conv1 (x ))
x =self .activation (self .conv2 (x ))
return x +residual
class MultiReceptiveFieldFusion (nn .Module ):
"""
Multi-Receptive Field Fusion (MRF) from HiFi-GAN.
Processes input through multiple parallel residual stacks with different
kernel sizes and dilations, then sums results.
"""
def __init__ (self ,channels :int ,kernel_sizes :List [int ]=[3 ,7 ,11 ],
dilations :List [List [int ]]=[[1 ,3 ,5 ],[1 ,3 ,5 ],[1 ,3 ,5 ]]):
super ().__init__ ()
self .num_kernels =len (kernel_sizes )
self .resblocks =nn .ModuleList ()
for k ,d_list in zip (kernel_sizes ,dilations ):
blocks =nn .ModuleList ([
ResidualBlock1D (channels ,k ,d )for d in d_list
])
self .resblocks .append (blocks )
def forward (self ,x :torch .Tensor )->torch .Tensor :
out =None
for blocks in self .resblocks :
h =x
for block in blocks :
h =block (h )
out =h if out is None else out +h
return out /self .num_kernels
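# Usage sketch (illustrative only): the MRF block preserves both channel count and
# temporal length; the kernel sizes/dilations defaulted above follow HiFi-GAN.
#
#   mrf = MultiReceptiveFieldFusion(channels=256)
#   y = mrf(torch.randn(1, 256, 400))      # -> [1, 256, 400], averaged over 3 stacks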
class RawWaveformDecoder (nn .Module ):
"""
SOTA Raw Waveform Decoder - BigVGAN/HiFi-GAN style architecture.
Converts features directly to playable audio waveform without external vocoder.
SOTA Features:
- Snake activation (BigVGAN) - preserves audio periodicity
- Multi-Receptive Field Fusion (HiFi-GAN) - captures patterns at multiple scales
- Weight normalization - stable training
- Efficient upsampling with careful kernel/stride ratios
- Anti-aliased resampling
- Streaming-capable architecture
Speed optimizations:
- Fewer layers with smarter architecture
- Fused operations where possible
- Efficient 256x total upsampling (vs 64x before)
"""
def __init__ (
self ,
hidden_size :int =1024 ,
sample_rate :int =16000 ,
upsample_rates :List [int ]=[8 ,8 ,2 ,2 ],
upsample_kernel_sizes :List [int ]=[16 ,16 ,4 ,4 ],
resblock_kernel_sizes :List [int ]=[3 ,7 ,11 ],
resblock_dilations :List [List [int ]]=[[1 ,3 ,5 ],[1 ,3 ,5 ],[1 ,3 ,5 ]],
initial_channels :int =512 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .sample_rate =sample_rate
self .num_upsamples =len (upsample_rates )
self .input_proj =nn .utils .parametrizations .weight_norm (
nn .Conv1d (hidden_size ,initial_channels ,kernel_size =7 ,padding =3 )
)
self .upsamplers =nn .ModuleList ()
self .mrf_blocks =nn .ModuleList ()
channels =initial_channels
for i ,(rate ,kernel )in enumerate (zip (upsample_rates ,upsample_kernel_sizes )):
self .upsamplers .append (
nn .utils .parametrizations .weight_norm (
nn .ConvTranspose1d (
channels ,channels //2 ,
kernel_size =kernel ,stride =rate ,
padding =(kernel -rate )//2
)
)
)
channels =channels //2
self .mrf_blocks .append (
MultiReceptiveFieldFusion (channels ,resblock_kernel_sizes ,resblock_dilations )
)
self .final_activation =SnakeActivation (channels )
self .output_conv =nn .utils .parametrizations .weight_norm (
nn .Conv1d (channels ,1 ,kernel_size =7 ,padding =3 )
)
self .upsample_factor =1
for rate in upsample_rates :
self .upsample_factor *=rate
print (f" 🔊 RawWaveformDecoder (SOTA BigVGAN-style):")
print (f" - Snake activation for audio periodicity")
print (f" - Multi-Receptive Field Fusion")
print (f" - {self .upsample_factor }x upsampling")
print (f" - Weight normalized layers")
def forward (
self ,
features :torch .Tensor ,
target_length :Optional [int ]=None ,
)->torch .Tensor :
"""
Decode features to raw waveform.
Args:
features: [B, T, hidden_size] encoded features
target_length: Optional target waveform length (for matching input length)
Returns:
waveform: [B, T_audio] raw audio waveform in [-1, 1]
"""
x =features .transpose (1 ,2 )
x =self .input_proj (x )
for upsample ,mrf in zip (self .upsamplers ,self .mrf_blocks ):
x =upsample (x )
x =mrf (x )
x =self .final_activation (x )
waveform =self .output_conv (x )
waveform =torch .tanh (waveform )
waveform =waveform .squeeze (1 )
if target_length is not None and waveform .shape [-1 ]!=target_length :
waveform =F .interpolate (
waveform .unsqueeze (1 ),
size =target_length ,
mode ='linear',
align_corners =False
).squeeze (1 )
return waveform
def decode_from_codes (
self ,
codes :torch .Tensor ,
codebooks :nn .ModuleList ,
target_length :Optional [int ]=None ,
)->torch .Tensor :
"""
Decode directly from codebook indices.
Args:
codes: [B, T, num_codebooks] codebook indices
codebooks: List of nn.Embedding codebooks from encoder
target_length: Optional target waveform length
Returns:
waveform: [B, T_audio] raw audio waveform
"""
features =torch .zeros (
codes .shape [0 ],codes .shape [1 ],codebooks [0 ].embedding_dim ,
device =codes .device ,dtype =codebooks [0 ].weight .dtype
)
for i ,codebook in enumerate (codebooks ):
features =features +codebook (codes [:,:,i ])
return self .forward (features ,target_length )
@torch .no_grad ()
def stream_decode (
self ,
features :torch .Tensor ,
chunk_size :int =10 ,
)->torch .Tensor :
"""
Streaming decode for real-time speech synthesis.
Processes features in chunks for low-latency output.
Args:
features: [B, T, hidden_size] encoded features
chunk_size: Number of feature frames per chunk
Returns:
waveform: [B, T_audio] full waveform, decoded chunk by chunk and concatenated
"""
batch_size ,seq_len ,_ =features .shape
audio_chunks =[]
for start in range (0 ,seq_len ,chunk_size ):
end =min (start +chunk_size ,seq_len )
chunk =features [:,start :end ,:]
audio_chunk =self .forward (chunk )
audio_chunks .append (audio_chunk )
return torch .cat (audio_chunks ,dim =-1 )
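# Usage sketch (illustrative only): with the default upsample_rates [8, 8, 2, 2]
# each feature frame expands into 8*8*2*2 = 256 audio samples. The feature tensor
# below is a hypothetical encoder output, not part of any real pipeline.
#
#   dec = RawWaveformDecoder(hidden_size=1024, sample_rate=16000)
#   feats = torch.randn(1, 100, 1024)                    # [B, T, hidden_size]
#   wav = dec(feats)                                      # [1, 25600] samples in [-1, 1]
#   wav_stream = dec.stream_decode(feats, chunk_size=10)  # chunked decode, then concatenated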
class SpeakerEncoder (nn .Module ):
"""
Zero-Shot Speaker Encoder for speaker cloning.
Extracts speaker embeddings from reference audio that can be used
to clone the speaker's voice characteristics.
"""
def __init__ (
self ,
hidden_size :int =256 ,
output_size :int =256 ,
num_layers :int =3 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .output_size =output_size
self .frame_encoder =nn .Sequential (
nn .Conv1d (80 ,hidden_size ,5 ,1 ,2 ),
nn .ReLU (),
nn .GroupNorm (1 ,hidden_size ),
nn .Conv1d (hidden_size ,hidden_size ,5 ,1 ,2 ),
nn .ReLU (),
nn .GroupNorm (1 ,hidden_size ),
nn .Conv1d (hidden_size ,hidden_size ,5 ,1 ,2 ),
nn .ReLU (),
nn .GroupNorm (1 ,hidden_size ),
)
self .lstm =nn .LSTM (
hidden_size ,hidden_size ,
num_layers =num_layers ,
batch_first =True ,
bidirectional =True ,
)
self .attention =nn .Sequential (
nn .Linear (hidden_size *2 ,hidden_size ),
nn .Tanh (),
nn .Linear (hidden_size ,1 ),
)
self .output_proj =nn .Linear (hidden_size *2 ,output_size )
print (f" 👤 SpeakerEncoder: {hidden_size }d -> {output_size }d speaker embedding")
def forward (self ,mel_spectrogram :torch .Tensor )->torch .Tensor :
"""
Extract speaker embedding from mel spectrogram.
Args:
mel_spectrogram: [B, n_mels, T] mel spectrogram
Returns:
speaker_embedding: [B, output_size] speaker embedding
"""
x =self .frame_encoder (mel_spectrogram )
x =x .transpose (1 ,2 )
x ,_ =self .lstm (x )
attn_weights =self .attention (x )
attn_weights =F .softmax (attn_weights ,dim =1 )
x =(x *attn_weights ).sum (dim =1 )
speaker_embedding =self .output_proj (x )
speaker_embedding =F .normalize (speaker_embedding ,p =2 ,dim =-1 )
return speaker_embedding
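# Usage sketch (illustrative only): the encoder expects an 80-bin mel spectrogram
# and returns an L2-normalized speaker embedding. Shapes are assumptions.
#
#   spk_enc = SpeakerEncoder(hidden_size=256, output_size=256)
#   ref_mel = torch.randn(2, 80, 300)                    # [B, n_mels, T_ref]
#   spk = spk_enc(ref_mel)                               # [2, 256], unit norm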
class MonotonicAlignmentSearch (nn .Module ):
"""
Monotonic Alignment Search (MAS) for text-to-audio alignment.
Implements both:
1. Hard MAS for inference (dynamic programming)
2. Soft/Fluid MAS for training (differentiable)
"""
def __init__ (self ,hidden_size :int =1024 ):
super ().__init__ ()
self .hidden_size =hidden_size
self .alignment_proj =nn .Sequential (
nn .Linear (hidden_size *2 ,hidden_size ),
nn .ReLU (),
nn .Linear (hidden_size ,1 ),
)
self .duration_predictor =nn .Sequential (
nn .Conv1d (hidden_size ,hidden_size ,3 ,padding =1 ),
nn .ReLU (),
nn .GroupNorm (1 ,hidden_size ),
nn .Conv1d (hidden_size ,hidden_size ,3 ,padding =1 ),
nn .ReLU (),
nn .GroupNorm (1 ,hidden_size ),
nn .Conv1d (hidden_size ,1 ,1 ),
)
@staticmethod
def hard_mas (log_probs :torch .Tensor )->torch .Tensor :
"""
Hard Monotonic Alignment Search using dynamic programming.
Args:
log_probs: [B, T_text, T_audio] log alignment probabilities
Returns:
alignment: [B, T_text, T_audio] hard alignment matrix
"""
batch_size ,text_len ,audio_len =log_probs .shape
device =log_probs .device
Q =torch .full ((batch_size ,text_len ,audio_len ),float ('-inf'),device =device )
Q [:,0 ,0 ]=log_probs [:,0 ,0 ]
for j in range (1 ,audio_len ):
Q [:,0 ,j ]=Q [:,0 ,j -1 ]+log_probs [:,0 ,j ]
for i in range (1 ,text_len ):
Q [:,i ,i ]=Q [:,i -1 ,i -1 ]+log_probs [:,i ,i ]
for j in range (i +1 ,audio_len ):
Q [:,i ,j ]=torch .max (Q [:,i -1 ,j -1 ],Q [:,i ,j -1 ])+log_probs [:,i ,j ]
alignment =torch .zeros_like (log_probs )
for b in range (batch_size ):
i ,j =text_len -1 ,audio_len -1
while i >=0 and j >=0 :
alignment [b ,i ,j ]=1
if i ==0 :
j -=1
elif j ==0 :
i -=1
elif Q [b ,i -1 ,j -1 ]>=Q [b ,i ,j -1 ]:
i -=1
j -=1
else :
j -=1
return alignment
def soft_mas (
self ,
text_hidden :torch .Tensor ,
audio_hidden :torch .Tensor ,
temperature :float =1.0 ,
)->torch .Tensor :
"""
Soft/Differentiable Monotonic Alignment Search.
Args:
text_hidden: [B, T_text, hidden_size] text features
audio_hidden: [B, T_audio, hidden_size] audio features
temperature: Softmax temperature
Returns:
soft_alignment: [B, T_text, T_audio] soft alignment matrix
"""
batch_size ,text_len ,_ =text_hidden .shape
audio_len =audio_hidden .shape [1 ]
text_expanded =text_hidden .unsqueeze (2 ).expand (-1 ,-1 ,audio_len ,-1 )
audio_expanded =audio_hidden .unsqueeze (1 ).expand (-1 ,text_len ,-1 ,-1 )
combined =torch .cat ([text_expanded ,audio_expanded ],dim =-1 )
logits =self .alignment_proj (combined ).squeeze (-1 )
logits =logits /temperature
position_bias =torch .arange (audio_len ,device =logits .device ).float ()
position_bias =position_bias .unsqueeze (0 ).unsqueeze (0 )
text_positions =torch .arange (text_len ,device =logits .device ).float ()
text_positions =text_positions .unsqueeze (0 ).unsqueeze (2 )
expected_pos =text_positions *(audio_len /text_len )
monotonic_bias =-0.1 *(position_bias -expected_pos ).abs ()
logits =logits +monotonic_bias
soft_alignment =F .softmax (logits ,dim =-1 )
return soft_alignment
def predict_durations (self ,text_hidden :torch .Tensor )->torch .Tensor :
"""
Predict durations for each text token.
Args:
text_hidden: [B, T_text, hidden_size] text features
Returns:
durations: [B, T_text] predicted durations
"""
x =text_hidden .transpose (1 ,2 )
durations =self .duration_predictor (x ).squeeze (1 )
durations =F .softplus (durations )
return durations
def forward (
self ,
text_hidden :torch .Tensor ,
audio_hidden :Optional [torch .Tensor ]=None ,
use_hard :bool =False ,
)->Tuple [torch .Tensor ,torch .Tensor ]:
"""
Compute alignment and durations.
Args:
text_hidden: [B, T_text, hidden_size] text features
audio_hidden: [B, T_audio, hidden_size] audio features (optional for inference)
use_hard: Use hard MAS instead of soft
Returns:
alignment: [B, T_text, T_audio] alignment matrix
durations: [B, T_text] predicted durations
"""
durations =self .predict_durations (text_hidden )
if audio_hidden is None :
return None ,durations
if use_hard :
text_norm =F .normalize (text_hidden ,dim =-1 )
audio_norm =F .normalize (audio_hidden ,dim =-1 )
log_probs =torch .bmm (text_norm ,audio_norm .transpose (1 ,2 ))
alignment =self .hard_mas (log_probs )
else :
alignment =self .soft_mas (text_hidden ,audio_hidden )
return alignment ,durations
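# Usage sketch (illustrative only): hard MAS on a toy pair of sequences. The
# text/audio lengths are arbitrary; during training the module would be called
# with use_hard=False to get the differentiable soft alignment instead.
#
#   mas = MonotonicAlignmentSearch(hidden_size=1024)
#   text_h = torch.randn(1, 5, 1024)                     # [B, T_text, H]
#   audio_h = torch.randn(1, 20, 1024)                   # [B, T_audio, H]
#   align, durs = mas(text_h, audio_h, use_hard=True)    # align: [1, 5, 20], durs: [1, 5]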
class RotaryMultiHeadLatentAttention (nn .Module ):
"""
Rotary Multi-Head Latent Attention (RMLA).
Combines:
- Multi-Head Latent Attention (MLA) for compressed KV cache
- Rotary Position Embeddings (RoPE) for position awareness
- Efficient attention computation
"""
def __init__ (
self ,
hidden_size :int =1024 ,
num_heads :int =16 ,
num_kv_heads :int =4 ,
head_dim :int =64 ,
kv_lora_rank :int =256 ,
max_position_embeddings :int =8192 ,
dropout :float =0.1 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_heads =num_heads
self .num_kv_heads =num_kv_heads
self .head_dim =head_dim
self .kv_lora_rank =kv_lora_rank
self .num_key_value_groups =num_heads //num_kv_heads
self .q_proj =nn .Linear (hidden_size ,num_heads *head_dim ,bias =False )
self .kv_a_proj =nn .Linear (hidden_size ,kv_lora_rank +head_dim ,bias =False )
self .kv_b_proj =nn .Linear (kv_lora_rank ,num_kv_heads *head_dim *2 ,bias =False )
self .kv_norm =nn .LayerNorm (kv_lora_rank )
self .o_proj =nn .Linear (num_heads *head_dim ,hidden_size ,bias =False )
self .rotary_emb =self ._create_rotary_embedding (head_dim ,max_position_embeddings )
self .dropout =nn .Dropout (dropout )
self .scale =head_dim **-0.5
def _create_rotary_embedding(self, dim: int, max_seq_len: int) -> None:
"""Precompute rotary cos/sin caches as registered buffers; returns None (no submodule is created)."""
inv_freq =1.0 /(10000 **(torch .arange (0 ,dim ,2 ).float ()/dim ))
self .register_buffer ('inv_freq',inv_freq )
t =torch .arange (max_seq_len ).float ()
freqs =torch .einsum ('i,j->ij',t ,inv_freq )
emb =torch .cat ([freqs ,freqs ],dim =-1 )
self .register_buffer ('cos_cached',emb .cos ())
self .register_buffer ('sin_cached',emb .sin ())
return None
def _apply_rotary (self ,x :torch .Tensor ,seq_len :int )->torch .Tensor :
"""Apply rotary position embeddings."""
cos =self .cos_cached [:seq_len ].unsqueeze (0 ).unsqueeze (0 )
sin =self .sin_cached [:seq_len ].unsqueeze (0 ).unsqueeze (0 )
x1 ,x2 =x [...,:x .shape [-1 ]//2 ],x [...,x .shape [-1 ]//2 :]
rotated =torch .cat ([-x2 ,x1 ],dim =-1 )
return x *cos .to (x .dtype )+rotated *sin .to (x .dtype )
def forward (
self ,
hidden_states :torch .Tensor ,
attention_mask :Optional [torch .Tensor ]=None ,
past_key_value :Optional [Tuple [torch .Tensor ,torch .Tensor ]]=None ,
use_cache :bool =False ,
)->Tuple [torch .Tensor ,Optional [Tuple [torch .Tensor ,torch .Tensor ]]]:
"""
Forward pass with RMLA.
Args:
hidden_states: [B, T, hidden_size]
attention_mask: Optional attention mask
past_key_value: Optional cached KV states
use_cache: Whether to return updated cache
Returns:
output: [B, T, hidden_size]
present_key_value: Optional updated cache
"""
batch_size ,seq_len ,_ =hidden_states .shape
query =self .q_proj (hidden_states )
query =query .view (batch_size ,seq_len ,self .num_heads ,self .head_dim ).transpose (1 ,2 )
kv_compressed =self .kv_a_proj (hidden_states )
kv_latent ,k_pe =kv_compressed .split ([self .kv_lora_rank ,self .head_dim ],dim =-1 )
kv_latent =self .kv_norm (kv_latent )
kv =self .kv_b_proj (kv_latent )
key ,value =kv .split (self .num_kv_heads *self .head_dim ,dim =-1 )
key =key .view (batch_size ,seq_len ,self .num_kv_heads ,self .head_dim ).transpose (1 ,2 )
value =value .view (batch_size ,seq_len ,self .num_kv_heads ,self .head_dim ).transpose (1 ,2 )
query =self ._apply_rotary (query ,seq_len )
key =self ._apply_rotary (key ,seq_len )
if past_key_value is not None :
past_key ,past_value =past_key_value
key =torch .cat ([past_key ,key ],dim =2 )
value =torch .cat ([past_value ,value ],dim =2 )
present_key_value =(key ,value )if use_cache else None
qk_scale =self .head_dim **-0.25
kv_len =key .shape [2 ]
use_causal =(attention_mask is None and seq_len >1 and seq_len ==kv_len )
dropout_p =self .dropout .p if self .training else 0.0
output =F .scaled_dot_product_attention (
query *qk_scale ,
key *qk_scale ,
value ,
attn_mask =attention_mask ,
is_causal =use_causal ,
dropout_p =dropout_p ,
scale =1.0 ,
enable_gqa =(self .num_key_value_groups >1 ),
)
output =output .transpose (1 ,2 ).contiguous ().view (batch_size ,seq_len ,-1 )
output =self .o_proj (output )
return output ,present_key_value
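# Usage sketch (illustrative only): the KV projection first compresses the hidden
# state to kv_lora_rank (256 by default) before expanding to per-head keys/values,
# which is what shrinks the KV cache relative to standard multi-head attention.
# Note the SDPA call above passes enable_gqa, which assumes a recent PyTorch release.
#
#   attn = RotaryMultiHeadLatentAttention(hidden_size=1024, num_heads=16, num_kv_heads=4)
#   x = torch.randn(2, 64, 1024)
#   out, kv_cache = attn(x, use_cache=True)              # out: [2, 64, 1024], kv_cache: (key, value)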
class InContextAudioPrompting (nn .Module ):
"""
In-Context Audio Prompting for conditioning generation on reference audio.
Allows the model to use a reference audio clip to guide the style,
speaker characteristics, and prosody of generated audio.
"""
def __init__ (
self ,
hidden_size :int =1024 ,
num_prompt_tokens :int =32 ,
num_heads :int =8 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_prompt_tokens =num_prompt_tokens
self .prompt_tokens =nn .Parameter (torch .randn (1 ,num_prompt_tokens ,hidden_size )*0.02 )
self .cross_attn =nn .MultiheadAttention (
hidden_size ,num_heads ,
dropout =0.1 ,
batch_first =True ,
)
self .prompt_encoder =nn .Sequential (
nn .Linear (hidden_size ,hidden_size ),
nn .SiLU (),
nn .Linear (hidden_size ,hidden_size ),
)
self .gate =nn .Parameter (torch .zeros (1 ))
self .norm =nn .LayerNorm (hidden_size )
def encode_prompt (self ,audio_features :torch .Tensor )->torch .Tensor :
"""
Encode reference audio into prompt tokens.
Args:
audio_features: [B, T, hidden_size] reference audio features
Returns:
prompt: [B, num_prompt_tokens, hidden_size] encoded prompt
"""
batch_size =audio_features .shape [0 ]
prompt =self .prompt_tokens .expand (batch_size ,-1 ,-1 )
prompt ,_ =self .cross_attn (prompt ,audio_features ,audio_features )
prompt =self .prompt_encoder (prompt )
return prompt
def forward (
self ,
hidden_states :torch .Tensor ,
prompt_features :Optional [torch .Tensor ]=None ,
audio_prompt :Optional [torch .Tensor ]=None ,
)->torch .Tensor :
"""
Apply in-context audio prompting.
Args:
hidden_states: [B, T, hidden_size] input features
prompt_features: [B, num_prompt_tokens, hidden_size] pre-encoded prompt
audio_prompt: [B, T_prompt, hidden_size] raw audio features to encode
Returns:
output: [B, T, hidden_size] conditioned features
"""
if prompt_features is None and audio_prompt is not None :
prompt_features =self .encode_prompt (audio_prompt )
if prompt_features is None :
return hidden_states
attended ,_ =self .cross_attn (hidden_states ,prompt_features ,prompt_features )
gate =torch .sigmoid (self .gate )
output =hidden_states +gate *attended
output =self .norm (output )
return output
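# Usage sketch (illustrative only): conditioning a hidden sequence on a reference
# clip. A learned sigmoid gate mixes the prompt-attended features back into the
# input before the final LayerNorm. Shapes are assumptions.
#
#   prompting = InContextAudioPrompting(hidden_size=1024, num_prompt_tokens=32)
#   hidden = torch.randn(2, 120, 1024)
#   ref = torch.randn(2, 200, 1024)                      # reference audio features
#   conditioned = prompting(hidden, audio_prompt=ref)    # [2, 120, 1024]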
class ConvolutionModule (nn .Module ):
"""Conformer convolution module with gating."""
def __init__ (self ,channels :int ,kernel_size :int =31 ,dropout :float =0.1 ):
super ().__init__ ()
self .layer_norm =nn .LayerNorm (channels )
self .pointwise_conv1 =nn .Conv1d (channels ,2 *channels ,kernel_size =1 )
self .depthwise_conv =nn .Conv1d (
channels ,channels ,kernel_size =kernel_size ,
padding =(kernel_size -1 )//2 ,groups =channels
)
self .batch_norm =nn .GroupNorm (1 ,channels )
self .pointwise_conv2 =nn .Conv1d (channels ,channels ,kernel_size =1 )
self .dropout =nn .Dropout (dropout )
def forward (self ,x :torch .Tensor )->torch .Tensor :
"""x: [B, T, C]"""
x =self .layer_norm (x )
x =x .transpose (1 ,2 )
x =self .pointwise_conv1 (x )
x =F .glu (x ,dim =1 )
x =self .depthwise_conv (x )
x =self .batch_norm (x )
x =F .silu (x )
x =self .pointwise_conv2 (x )
x =self .dropout (x )
return x .transpose (1 ,2 )
class ConformerBlock (nn .Module ):
"""Single Conformer block with RMLA, feed-forward, and convolution."""
def __init__ (
self ,
d_model :int ,
num_heads :int =8 ,
ff_expansion :int =4 ,
conv_kernel_size :int =31 ,
dropout :float =0.1 ,
use_rmla :bool =True ,
):
super ().__init__ ()
self .use_rmla =use_rmla
self .ff1_norm =nn .LayerNorm (d_model )
self .ff1 =nn .Sequential (
nn .Linear (d_model ,d_model *ff_expansion ),
nn .SiLU (),
nn .Dropout (dropout ),
nn .Linear (d_model *ff_expansion ,d_model ),
nn .Dropout (dropout )
)
if use_rmla :
self .attn =RotaryMultiHeadLatentAttention (
hidden_size =d_model ,
num_heads =num_heads ,
num_kv_heads =max (1 ,num_heads //4 ),
head_dim =d_model //num_heads ,
kv_lora_rank =d_model //4 ,
dropout =dropout ,
)
else :
self .attn_norm =nn .LayerNorm (d_model )
self .attn =nn .MultiheadAttention (d_model ,num_heads ,dropout =dropout ,batch_first =True )
self .attn_dropout =nn .Dropout (dropout )
self .conv =ConvolutionModule (d_model ,conv_kernel_size ,dropout )
self .ff2_norm =nn .LayerNorm (d_model )
self .ff2 =nn .Sequential (
nn .Linear (d_model ,d_model *ff_expansion ),
nn .SiLU (),
nn .Dropout (dropout ),
nn .Linear (d_model *ff_expansion ,d_model ),
nn .Dropout (dropout )
)
self .final_norm =nn .LayerNorm (d_model )
def forward (
self ,
x :torch .Tensor ,
mask :Optional [torch .Tensor ]=None ,
past_key_value :Optional [Tuple ]=None ,
use_cache :bool =False ,
)->Tuple [torch .Tensor ,Optional [Tuple ]]:
x =x +0.5 *self .ff1 (self .ff1_norm (x ))
if self .use_rmla :
attn_mask =None
if mask is not None :
attn_mask =mask .unsqueeze (1 ).unsqueeze (2 )
attn_mask =attn_mask .to (dtype =x .dtype )
attn_mask =attn_mask .masked_fill (attn_mask .bool (),float ('-inf'))
attn_out ,present_kv =self .attn (x ,attention_mask =attn_mask ,past_key_value =past_key_value ,use_cache =use_cache )
else :
attn_out ,_ =self .attn (self .attn_norm (x ),self .attn_norm (x ),self .attn_norm (x ),key_padding_mask =mask )
present_kv =None
x =x +self .attn_dropout (attn_out )
x =x +self .conv (x )
x =x +0.5 *self .ff2 (self .ff2_norm (x ))
return self .final_norm (x ),present_kv
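# Usage sketch (illustrative only): a single Conformer block on the RMLA path.
# The half-step feed-forward residuals follow the Macaron structure of the
# original Conformer; present_kv is None unless use_cache=True is passed.
#
#   block = ConformerBlock(d_model=1024, num_heads=8)
#   x = torch.randn(2, 200, 1024)
#   y, present_kv = block(x)                             # y: [2, 200, 1024]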
class AudioEncoder (nn .Module ):
"""
SOTA Audio Encoder with Raw Waveform Tokenization, RMLA, and Voice Enhancement.
Features:
- Raw waveform tokenization (no mel spectrogram)
- Conformer blocks with RMLA
- Zero-shot speaker encoding
- In-context audio prompting
- Gradient checkpointing support for memory efficiency
Voice Enhancement Features (SOTA):
- Prosody-aware EoT Prediction (interruption detection)
- AVD Emotion Recognition (arousal/valence/dominance)
- Dynamic Latent Vocalizations (singing/rapping)
- Neural Sound Effects (beatboxing, breathing, expressions)
- Speculative Decoding (mid-stream token rewriting)
"""
def __init__ (
self ,
hidden_size :int =1024 ,
n_mels :int =80 ,
max_audio_length :int =3000 ,
num_layers :int =6 ,
num_heads :int =8 ,
dropout :float =0.1 ,
use_raw_waveform :bool =True ,
enable_eot :bool =True ,
enable_emotion :bool =True ,
enable_singing :bool =True ,
enable_effects :bool =True ,
enable_speculative :bool =True ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .max_audio_length =max_audio_length
self .use_raw_waveform =use_raw_waveform
self .gradient_checkpointing =False
self .enable_eot =enable_eot
self .enable_emotion =enable_emotion
self .enable_singing =enable_singing
self .enable_effects =enable_effects
self .enable_speculative =enable_speculative
if use_raw_waveform :
self .waveform_tokenizer =RawWaveformTokenizer (
hidden_size =hidden_size ,
num_codebooks =8 ,
codebook_size =1024 ,
)
else :
self .waveform_tokenizer =None
self .conv_subsample =nn .Sequential (
nn .Conv1d (n_mels ,hidden_size //2 ,kernel_size =3 ,stride =2 ,padding =1 ),
nn .GELU (),
nn .Conv1d (hidden_size //2 ,hidden_size ,kernel_size =3 ,stride =2 ,padding =1 ),
nn .GELU (),
)
self .speaker_encoder =SpeakerEncoder (
hidden_size =256 ,
output_size =hidden_size //4 ,
)
self .audio_prompting =InContextAudioPrompting (
hidden_size =hidden_size ,
num_prompt_tokens =32 ,
)
self .conformer_blocks =nn .ModuleList ([
ConformerBlock (
hidden_size ,num_heads ,
ff_expansion =4 ,
conv_kernel_size =31 ,
dropout =dropout ,
use_rmla =True ,
)
for _ in range (num_layers )
])
self .output_proj =nn .Linear (hidden_size ,hidden_size )
if enable_eot :
self .eot_predictor =ProsodyAwareEoTPredictor (hidden_size ,dropout =dropout )
else :
self .eot_predictor =None
if enable_emotion :
self .emotion_recognizer =AVDEmotionRecognizer (hidden_size ,dropout =dropout )
else :
self .emotion_recognizer =None
if enable_singing :
self .vocalizer =DynamicLatentVocalizer (hidden_size )
else :
self .vocalizer =None
if enable_effects :
self .effects_generator =NeuralSoundEffectGenerator (hidden_size )
else :
self .effects_generator =None
if enable_speculative :
self .speculative_decoder =SpeculativeAudioDecoder (hidden_size )
else :
self .speculative_decoder =None
print (f" 🎤 AudioEncoder (RMLA Conformer): {hidden_size }d, {num_layers } layers")
if use_raw_waveform :
print (f" - Raw Waveform Tokenizer enabled")
print (f" - Zero-Shot Speaker Encoder enabled")
print (f" - In-Context Audio Prompting enabled")
print (f" - EoT/Interruption Detection: {enable_eot }")
print (f" - Emotion Recognition (AVD): {enable_emotion }")
print (f" - Singing/Rapping (Vocalizer): {enable_singing }")
print (f" - Sound Effects Generator: {enable_effects }")
print (f" - Speculative Decoding: {enable_speculative }")
def gradient_checkpointing_enable (self ):
"""Enable gradient checkpointing to save memory during training."""
self .gradient_checkpointing =True
if hasattr (self ,'waveform_tokenizer')and self .waveform_tokenizer is not None :
if hasattr (self .waveform_tokenizer ,'gradient_checkpointing'):
self .waveform_tokenizer .gradient_checkpointing =True
if hasattr (self ,'speaker_encoder')and self .speaker_encoder is not None :
if hasattr (self .speaker_encoder ,'gradient_checkpointing'):
self .speaker_encoder .gradient_checkpointing =True
def gradient_checkpointing_disable (self ):
"""Disable gradient checkpointing."""
self .gradient_checkpointing =False
def forward (
self ,
audio_input :torch .Tensor ,
speaker_ref :Optional [torch .Tensor ]=None ,
audio_prompt :Optional [torch .Tensor ]=None ,
mask :Optional [torch .Tensor ]=None ,
return_eot :bool =False ,
return_emotion :bool =False ,
)->Tuple [torch .Tensor ,Optional [torch .Tensor ],Optional [dict ]]:
"""
Process audio to features with optional voice enhancement outputs.
Args:
audio_input: [B, T] raw waveform or [B, n_mels, T] mel spectrogram
speaker_ref: [B, n_mels, T_ref] reference audio for speaker cloning
audio_prompt: [B, T_prompt, hidden_size] audio prompt features
mask: Optional attention mask
return_eot: Whether to return EoT/interruption predictions
return_emotion: Whether to return emotion/AVD predictions
Returns:
features: [B, T', hidden_size] audio features
speaker_embedding: [B, hidden_size//4] speaker embedding (if speaker_ref provided)
extras: dict with EoT/emotion predictions (if requested)
"""
commitment_loss =None
if self .use_raw_waveform and self .waveform_tokenizer is not None :
if audio_input .dim ()==3 and audio_input .shape [1 ]==1 :
audio_input =audio_input .squeeze (1 )
elif audio_input .dim ()==3 :
audio_input =audio_input .mean (dim =1 )
x ,commitment_loss =self .waveform_tokenizer (audio_input )
elif hasattr (self ,'conv_subsample')and self .conv_subsample is not None :
if audio_input .dim ()==2 :
audio_input =audio_input .unsqueeze (1 )
x =self .conv_subsample (audio_input )
x =x .transpose (1 ,2 )
else :
raise RuntimeError (
f"AudioEncoder: Incompatible configuration. "
f"use_raw_waveform={self .use_raw_waveform }, "
f"waveform_tokenizer={self .waveform_tokenizer is not None }, "
f"conv_subsample={hasattr (self ,'conv_subsample')and self .conv_subsample is not None }"
)
speaker_embedding =None
if speaker_ref is not None :
speaker_embedding =self .speaker_encoder (speaker_ref )
if audio_prompt is not None :
x =self .audio_prompting (x ,audio_prompt =audio_prompt )
if self .gradient_checkpointing and self .training :
from torch .utils .checkpoint import checkpoint
for block in self .conformer_blocks :
def create_custom_forward (module ):
def custom_forward (*inputs ):
return module (*inputs )
return custom_forward
x ,_ =checkpoint (create_custom_forward (block ),x ,mask ,use_reentrant =False )
else :
for block in self .conformer_blocks :
x ,_ =block (x ,mask )
x =self .output_proj (x )
extras ={}
if return_eot and self .eot_predictor is not None :
extras ["eot"]=self .eot_predictor (x ,mask )
if return_emotion and self .emotion_recognizer is not None :
extras ["emotion"]=self .emotion_recognizer (x ,mask )
return x ,speaker_embedding ,extras if extras else None
def detect_interruption (
self ,
audio_features :torch .Tensor ,
attention_mask :Optional [torch .Tensor ]=None ,
)->Optional [dict ]:
"""
Detect interruptions, backchannels, and turn-taking events.
Args:
audio_features: [B, T, hidden_size] encoded audio
attention_mask: [B, T] optional mask
Returns:
dict with eot_logits, event_logits, vad_logits, backoff_prob
"""
if self .eot_predictor is None :
return None
return self .eot_predictor (audio_features ,attention_mask )
def recognize_emotion (
self ,
audio_features :torch .Tensor ,
attention_mask :Optional [torch .Tensor ]=None ,
)->Optional [dict ]:
"""
Recognize emotion with AVD (arousal/valence/dominance) values.
Args:
audio_features: [B, T, hidden_size] encoded audio
attention_mask: [B, T] optional mask
Returns:
dict with emotion_logits, arousal, valence, dominance, response_mode
"""
if self .emotion_recognizer is None :
return None
return self .emotion_recognizer (audio_features ,attention_mask )
def generate_vocals (
self ,
text_features :torch .Tensor ,
style_id :Optional [torch .Tensor ]=None ,
mode_id :Optional [torch .Tensor ]=None ,
target_pitch :Optional [torch .Tensor ]=None ,
tempo_bpm :Optional [torch .Tensor ]=None ,
)->Optional [dict ]:
"""
Generate singing/rapping vocals from text/lyrics.
Args:
text_features: [B, T, hidden_size] text embeddings
style_id: [B] style indices (pop, rock, jazz, etc.)
mode_id: [B] mode indices (speak, sing, rap, hum, etc.)
target_pitch: [B, T] pitch targets
tempo_bpm: [B] tempo in BPM
Returns:
dict with vocal_features, pitch_logits, alignment, durations
"""
if self .vocalizer is None :
return None
return self .vocalizer (text_features ,style_id ,mode_id ,target_pitch ,tempo_bpm )
def generate_effects (
self ,
effect_ids :torch .Tensor ,
context :Optional [torch .Tensor ]=None ,
intensity :Optional [torch .Tensor ]=None ,
)->Optional [dict ]:
"""
Generate sound effects (beatbox, clicks, breathing, etc.).
Args:
effect_ids: [B] or [B, N] effect type indices
context: [B, T, hidden_size] optional context
intensity: [B] intensity values
Returns:
dict with effect_features, waveform, duration, intensity
"""
if self .effects_generator is None :
return None
return self .effects_generator (effect_ids ,context ,intensity )
def speculative_generate (
self ,
context :torch .Tensor ,
generate_draft :bool =True ,
verify_with :Optional [torch .Tensor ]=None ,
)->Optional [dict ]:
"""
Generate speculative draft tokens for mid-stream rewriting.
Args:
context: [B, T, hidden_size] current context
generate_draft: whether to generate new draft
verify_with: [B, T', hidden_size] new context to verify against
Returns:
dict with checkpoint, draft_tokens, confidence, accept_prob
"""
if self .speculative_decoder is None :
return None
return self .speculative_decoder (context ,generate_draft ,verify_with )
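# Usage sketch (illustrative only): encoding one second of 16 kHz audio through the
# raw-waveform path, with an optional speaker-reference mel for cloning. Shapes are
# assumptions; the voice-enhancement heads are queried via the return_* flags.
#
#   audio_enc = AudioEncoder(hidden_size=1024, num_layers=6)
#   wav = torch.randn(2, 16000)                          # [B, T] raw audio
#   ref_mel = torch.randn(2, 80, 300)                    # [B, n_mels, T_ref]
#   feats, spk, extras = audio_enc(wav, speaker_ref=ref_mel, return_emotion=True)
#   # feats: [2, ~250, 1024], spk: [2, 256], extras["emotion"]["arousal"]: [2, 1]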
class VariancePredictor (nn .Module ):
"""Variance predictor for duration, pitch, and energy."""
def __init__ (self ,hidden_size :int ,kernel_size :int =3 ,dropout :float =0.1 ):
super ().__init__ ()
self .conv1 =nn .Conv1d (hidden_size ,hidden_size ,kernel_size ,padding =kernel_size //2 )
self .norm1 =nn .LayerNorm (hidden_size )
self .conv2 =nn .Conv1d (hidden_size ,hidden_size ,kernel_size ,padding =kernel_size //2 )
self .norm2 =nn .LayerNorm (hidden_size )
self .dropout =nn .Dropout (dropout )
self .linear =nn .Linear (hidden_size ,1 )
def forward (self ,x :torch .Tensor )->torch .Tensor :
"""x: [B, T, C] -> [B, T]"""
out =self .conv1 (x .transpose (1 ,2 )).transpose (1 ,2 )
out =F .relu (out )
out =self .norm1 (out )
out =self .dropout (out )
out =self .conv2 (out .transpose (1 ,2 )).transpose (1 ,2 )
out =F .relu (out )
out =self .norm2 (out )
out =self .dropout (out )
return self .linear (out ).squeeze (-1 )
class FFTBlock (nn .Module ):
"""FFT block for mel decoder."""
def __init__ (
self ,
hidden_size :int ,
num_heads :int =4 ,
ff_expansion :int =4 ,
kernel_size :int =9 ,
dropout :float =0.1 ,
):
super ().__init__ ()
self .attn =RotaryMultiHeadLatentAttention (
hidden_size =hidden_size ,
num_heads =num_heads ,
num_kv_heads =max (1 ,num_heads //2 ),
head_dim =hidden_size //num_heads ,
kv_lora_rank =hidden_size //4 ,
dropout =dropout ,
)
self .attn_norm =nn .LayerNorm (hidden_size )
self .attn_dropout =nn .Dropout (dropout )
self .ff_norm =nn .LayerNorm (hidden_size )
self .ff =nn .Sequential (
nn .Conv1d (hidden_size ,hidden_size *ff_expansion ,kernel_size ,padding =kernel_size //2 ),
nn .ReLU (),
nn .Conv1d (hidden_size *ff_expansion ,hidden_size ,kernel_size ,padding =kernel_size //2 ),
nn .Dropout (dropout )
)
def forward (self ,x :torch .Tensor )->torch .Tensor :
residual =x
x =self .attn_norm (x )
x ,_ =self .attn (x )
x =residual +self .attn_dropout (x )
residual =x
x =self .ff_norm (x )
x =self .ff (x .transpose (1 ,2 )).transpose (1 ,2 )
x =residual +x
return x
class AudioDecoder (nn .Module ):
"""
SOTA Audio Decoder with MAS, Zero-Shot Speaker Cloning, and Voice Enhancement Support.
Features:
- Monotonic Alignment Search for text-to-audio alignment
- Zero-shot speaker cloning via speaker embeddings
- In-context audio prompting
- Variance adaptor with duration, pitch, energy prediction
- RMLA-based FFT blocks
- Gradient checkpointing support for memory efficiency
Voice Enhancement Features (matching AudioEncoder):
- Emotion conditioning for emotional speech synthesis
- Singing/vocal style synthesis support
- Sound effect generation and integration
- Raw waveform output support (optional)
- Speculative decoding integration
"""
def __init__ (
self ,
hidden_size :int =1024 ,
n_mels :int =80 ,
max_audio_length :int =1000 ,
num_speakers :int =256 ,
num_decoder_layers :int =4 ,
dropout :float =0.1 ,
enable_emotion :bool =True ,
enable_singing :bool =True ,
enable_effects :bool =True ,
enable_raw_waveform :bool =True ,
enable_speculative :bool =True ,
num_emotions :int =10 ,
num_vocal_styles :int =8 ,
num_vocal_modes :int =6 ,
num_effect_types :int =20 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .n_mels =n_mels
self .max_audio_length =max_audio_length
self .gradient_checkpointing =False
self .enable_emotion =enable_emotion
self .enable_singing =enable_singing
self .enable_effects =enable_effects
self .enable_raw_waveform =enable_raw_waveform
self .enable_speculative =enable_speculative
self .mas =MonotonicAlignmentSearch (hidden_size )
self .speaker_embed =nn .Embedding (num_speakers ,hidden_size //4 )
self .speaker_proj =nn .Linear (hidden_size //4 ,hidden_size //4 )
self .audio_prompting =InContextAudioPrompting (
hidden_size =hidden_size ,
num_prompt_tokens =32 ,
)
if enable_emotion :
self .emotion_embed =nn .Embedding (num_emotions ,hidden_size //4 )
self .avd_proj =nn .Sequential (
nn .Linear (3 ,hidden_size //8 ),
nn .SiLU (),
nn .Linear (hidden_size //8 ,hidden_size //4 ),
)
self .emotion_cond_size =hidden_size //4
else :
self .emotion_embed =None
self .avd_proj =None
self .emotion_cond_size =0
if enable_singing :
self .vocal_style_embed =nn .Embedding (num_vocal_styles ,hidden_size //4 )
self .vocal_mode_embed =nn .Embedding (num_vocal_modes ,hidden_size //4 )
self .tempo_proj =nn .Sequential (
nn .Linear (1 ,hidden_size //8 ),
nn .SiLU (),
nn .Linear (hidden_size //8 ,hidden_size //4 ),
)
self .singing_cond_size =hidden_size //4
else :
self .vocal_style_embed =None
self .vocal_mode_embed =None
self .tempo_proj =None
self .singing_cond_size =0
if enable_effects :
self .effect_embed =nn .Embedding (num_effect_types ,hidden_size //4 )
self .effect_intensity_proj =nn .Sequential (
nn .Linear (1 ,hidden_size //8 ),
nn .SiLU (),
nn .Linear (hidden_size //8 ,hidden_size //4 ),
)
self .effect_cond_size =hidden_size //4
else :
self .effect_embed =None
self .effect_intensity_proj =None
self .effect_cond_size =0
total_cond_size =hidden_size //4
total_cond_size +=self .emotion_cond_size
total_cond_size +=self .singing_cond_size
total_cond_size +=self .effect_cond_size
self .input_proj =nn .Linear (hidden_size +total_cond_size ,hidden_size )
self .encoder_blocks =nn .ModuleList ([
FFTBlock (hidden_size ,num_heads =4 ,ff_expansion =4 ,dropout =dropout )
for _ in range (4 )
])
self .duration_predictor =VariancePredictor (hidden_size ,dropout =dropout )
self .pitch_predictor =VariancePredictor (hidden_size ,dropout =dropout )
self .energy_predictor =VariancePredictor (hidden_size ,dropout =dropout )
self .pitch_embed =nn .Conv1d (1 ,hidden_size ,kernel_size =9 ,padding =4 )
self .energy_embed =nn .Conv1d (1 ,hidden_size ,kernel_size =9 ,padding =4 )
self .decoder_blocks =nn .ModuleList ([
FFTBlock (hidden_size ,num_heads =4 ,ff_expansion =4 ,dropout =dropout )
for _ in range (num_decoder_layers )
])
self .mel_linear =nn .Linear (hidden_size ,n_mels )
self .postnet =nn .ModuleList ([
nn .Sequential (
nn .Conv1d (n_mels ,256 ,kernel_size =5 ,padding =2 ),
nn .GroupNorm (1 ,256 ),
nn .Tanh (),
),
nn .Sequential (
nn .Conv1d (256 ,256 ,kernel_size =5 ,padding =2 ),
nn .GroupNorm (1 ,256 ),
nn .Tanh (),
),
nn .Sequential (
nn .Conv1d (256 ,256 ,kernel_size =5 ,padding =2 ),
nn .GroupNorm (1 ,256 ),
nn .Tanh (),
),
nn .Sequential (
nn .Conv1d (256 ,256 ,kernel_size =5 ,padding =2 ),
nn .GroupNorm (1 ,256 ),
nn .Tanh (),
),
nn .Conv1d (256 ,n_mels ,kernel_size =5 ,padding =2 ),
])
if enable_raw_waveform :
self .waveform_decoder =RawWaveformDecoder (
hidden_size =hidden_size ,
sample_rate =16000 ,
)
else :
self .waveform_decoder =None
if enable_speculative :
self .speculative_decoder =SpeculativeAudioDecoder (
hidden_size =hidden_size ,
draft_length =10 ,
)
else :
self .speculative_decoder =None
print (f" 🔊 AudioDecoder (MAS + RMLA): {hidden_size }d -> {n_mels } mels")
print (f" - Monotonic Alignment Search enabled")
print (f" - Zero-Shot Speaker Cloning enabled")
print (f" - In-Context Audio Prompting enabled")
print (f" - Emotion Conditioning: {enable_emotion }")
print (f" - Singing/Vocal Styles: {enable_singing }")
print (f" - Sound Effects: {enable_effects }")
print (f" - Raw Waveform Output: {enable_raw_waveform }")
print (f" - Speculative Decoding: {enable_speculative }")
def gradient_checkpointing_enable (self ):
"""Enable gradient checkpointing to save memory during training."""
self .gradient_checkpointing =True
def gradient_checkpointing_disable (self ):
"""Disable gradient checkpointing."""
self .gradient_checkpointing =False
def forward (
self ,
text_embeds :torch .Tensor ,
target_length :Optional [int ]=None ,
speaker :Optional [torch .Tensor ]=None ,
speaker_embedding :Optional [torch .Tensor ]=None ,
audio_prompt :Optional [torch .Tensor ]=None ,
audio_features :Optional [torch .Tensor ]=None ,
duration_target :Optional [torch .Tensor ]=None ,
pitch_target :Optional [torch .Tensor ]=None ,
energy_target :Optional [torch .Tensor ]=None ,
use_mas :bool =True ,
emotion_id :Optional [torch .Tensor ]=None ,
avd_values :Optional [torch .Tensor ]=None ,
vocal_style_id :Optional [torch .Tensor ]=None ,
vocal_mode_id :Optional [torch .Tensor ]=None ,
tempo_bpm :Optional [torch .Tensor ]=None ,
effect_id :Optional [torch .Tensor ]=None ,
effect_intensity :Optional [torch .Tensor ]=None ,
output_waveform :bool =False ,
use_speculative :bool =False ,
)->Tuple [torch .Tensor ,torch .Tensor ,Optional [torch .Tensor ],Optional [dict ]]:
"""
Generate mel-spectrogram from text embeddings with voice enhancement support.
Args:
text_embeds: [B, T, hidden_size] text embeddings
target_length: target mel length (for training)
speaker: [B] speaker IDs (for multi-speaker)
speaker_embedding: [B, hidden_size//4] zero-shot speaker embedding
audio_prompt: [B, T_prompt, hidden_size] audio prompt features
audio_features: [B, T_audio, hidden_size] target audio features (for MAS training)
duration_target: [B, T] ground truth durations
pitch_target: [B, T'] ground truth pitch
energy_target: [B, T'] ground truth energy
use_mas: Whether to use MAS for alignment
Voice enhancement args:
emotion_id: [B] discrete emotion category (0-9)
avd_values: [B, 3] continuous arousal/valence/dominance values
vocal_style_id: [B] singing style (0-7: pop, rock, jazz, etc.)
vocal_mode_id: [B] vocal mode (0-5: speak, sing, rap, hum, whistle, chant)
tempo_bpm: [B] tempo in BPM for singing/rapping
effect_id: [B] sound effect type (0-19)
effect_intensity: [B] effect intensity (0-1)
output_waveform: Whether to also output raw waveform
use_speculative: Whether to use speculative decoding
Returns:
mel: [B, n_mels, T'] generated mel spectrogram
durations: [B, T] predicted durations
alignment: [B, T_text, T_audio] alignment matrix (if use_mas and audio_features provided)
extras: dict with optional outputs (waveform, speculative results)
"""
batch_size ,seq_len ,_ =text_embeds .shape
device =text_embeds .device
dtype =text_embeds .dtype
extras ={}
if speaker_embedding is not None :
spk_emb =self .speaker_proj (speaker_embedding )
elif speaker is not None :
spk_emb =self .speaker_embed (speaker )
else :
speaker =torch .zeros (batch_size ,dtype =torch .long ,device =device )
spk_emb =self .speaker_embed (speaker )
spk_emb =spk_emb .unsqueeze (1 ).expand (-1 ,seq_len ,-1 ).to (dtype )
cond_embeds =[spk_emb ]
if self .enable_emotion :
if emotion_id is not None :
emo_emb =self .emotion_embed (emotion_id ).unsqueeze (1 ).expand (-1 ,seq_len ,-1 ).to (dtype )
elif avd_values is not None :
emo_emb =self .avd_proj (avd_values .to (dtype )).unsqueeze (1 ).expand (-1 ,seq_len ,-1 )
else :
neutral =torch .full ((batch_size ,),6 ,dtype =torch .long ,device =device )
emo_emb =self .emotion_embed (neutral ).unsqueeze (1 ).expand (-1 ,seq_len ,-1 ).to (dtype )
cond_embeds .append (emo_emb )
if self .enable_singing :
if vocal_style_id is not None :
style_emb =self .vocal_style_embed (vocal_style_id ).unsqueeze (1 ).expand (-1 ,seq_len ,-1 ).to (dtype )
else :
default_style =torch .zeros (batch_size ,dtype =torch .long ,device =device )
style_emb =self .vocal_style_embed (default_style ).unsqueeze (1 ).expand (-1 ,seq_len ,-1 ).to (dtype )
if vocal_mode_id is not None :
mode_emb =self .vocal_mode_embed (vocal_mode_id ).unsqueeze (1 ).expand (-1 ,seq_len ,-1 ).to (dtype )
else :
default_mode =torch .zeros (batch_size ,dtype =torch .long ,device =device )
mode_emb =self .vocal_mode_embed (default_mode ).unsqueeze (1 ).expand (-1 ,seq_len ,-1 ).to (dtype )
if tempo_bpm is not None :
tempo_norm =(tempo_bpm .float ()-60 )/120
tempo_emb =self .tempo_proj (tempo_norm .unsqueeze (-1 ).to (dtype )).unsqueeze (1 ).expand (-1 ,seq_len ,-1 )
else :
tempo_emb =torch .zeros (batch_size ,seq_len ,self .hidden_size //4 ,device =device ,dtype =dtype )
singing_emb =style_emb +mode_emb +tempo_emb
cond_embeds .append (singing_emb )
if self .enable_effects :
if effect_id is not None :
eff_emb =self .effect_embed (effect_id ).unsqueeze (1 ).expand (-1 ,seq_len ,-1 ).to (dtype )
if effect_intensity is not None :
intensity_emb =self .effect_intensity_proj (effect_intensity .unsqueeze (-1 ).to (dtype ))
eff_emb =eff_emb *intensity_emb .unsqueeze (1 )
else :
eff_emb =torch .zeros (batch_size ,seq_len ,self .hidden_size //4 ,device =device ,dtype =dtype )
cond_embeds .append (eff_emb )
all_cond =torch .cat (cond_embeds ,dim =-1 )
x =torch .cat ([text_embeds ,all_cond ],dim =-1 )
x =self .input_proj (x )
if audio_prompt is not None :
x =self .audio_prompting (x ,audio_prompt =audio_prompt )
if self .gradient_checkpointing and self .training :
from torch .utils .checkpoint import checkpoint
for block in self .encoder_blocks :
def create_custom_forward (module ):
def custom_forward (*inputs ):
return module (*inputs )
return custom_forward
x =checkpoint (create_custom_forward (block ),x ,use_reentrant =False )
else :
for block in self .encoder_blocks :
x =block (x )
alignment =None
if use_mas and audio_features is not None :
alignment ,durations =self .mas (x ,audio_features ,use_hard =not self .training )
else :
_ ,durations =self .mas (x )
if duration_target is not None :
durations =duration_target
pitch_pred =self .pitch_predictor (x )
energy_pred =F .softplus (self .energy_predictor (x ))
MIN_MEL_LENGTH =1
if target_length is not None :
mel_length =max (MIN_MEL_LENGTH ,target_length )
else :
mel_length =int (durations .sum (dim =1 ).max ().item ())
mel_length =max (16 ,min (mel_length ,self .max_audio_length ))
x =F .interpolate (x .transpose (1 ,2 ),size =mel_length ,mode ='linear',align_corners =False ).transpose (1 ,2 )
pitch =pitch_target if pitch_target is not None else pitch_pred
energy =energy_target if energy_target is not None else energy_pred
pitch_up =F .interpolate (pitch .unsqueeze (1 ),size =mel_length ,mode ='linear',align_corners =False )
energy_up =F .interpolate (energy .unsqueeze (1 ),size =mel_length ,mode ='linear',align_corners =False )
pitch_emb =self .pitch_embed (pitch_up ).transpose (1 ,2 )
energy_emb =self .energy_embed (energy_up ).transpose (1 ,2 )
x =x +pitch_emb +energy_emb
if self .gradient_checkpointing and self .training :
from torch .utils .checkpoint import checkpoint
for block in self .decoder_blocks :
def create_custom_forward (module ):
def custom_forward (*inputs ):
return module (*inputs )
return custom_forward
x =checkpoint (create_custom_forward (block ),x ,use_reentrant =False )
else :
for block in self .decoder_blocks :
x =block (x )
mel =self .mel_linear (x ).transpose (1 ,2 )
mel_post =mel
for layer in self .postnet :
mel_post =layer (mel_post )
mel =mel +mel_post
if output_waveform and self .waveform_decoder is not None :
waveform =self .waveform_decoder (x )
extras ["waveform"]=waveform
if use_speculative and self .speculative_decoder is not None :
spec_results =self .speculative_decoder (x )
extras ["speculative"]=spec_results
return mel ,durations ,alignment ,extras if extras else None
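# Usage sketch (illustrative only): text-to-mel synthesis with a discrete speaker id
# and an emotion id. text_embeds is a hypothetical LLM hidden-state slice; without a
# target_length the decoder infers the mel length from the predicted durations.
#
#   audio_dec = AudioDecoder(hidden_size=1024, n_mels=80)
#   text_embeds = torch.randn(2, 40, 1024)               # [B, T_text, H]
#   speaker = torch.zeros(2, dtype=torch.long)
#   emotion = torch.full((2,), 3, dtype=torch.long)
#   mel, durs, align, extras = audio_dec(text_embeds, speaker=speaker, emotion_id=emotion)
#   # mel: [2, 80, T_mel], durs: [2, 40], align is None when no audio_features are given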
class ProsodyAwareEoTPredictor (nn .Module ):
"""
Prosody-aware End-of-Turn (EoT) Prediction for real-time interruption detection.
Detects when a speaker is about to finish their turn, allowing the model to:
- Detect user interruptions (coughs, laughs, "uh-huh", etc.)
- Yield the floor when appropriate
- Adjust response mid-stream
Uses prosodic features (pitch, energy, rhythm) combined with semantic features.
"""
def __init__ (
self ,
hidden_size :int =1024 ,
num_eot_classes :int =5 ,
prosody_dim :int =128 ,
num_heads :int =4 ,
dropout :float =0.1 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_eot_classes =num_eot_classes
self .pitch_conv =nn .Sequential (
nn .Conv1d (1 ,prosody_dim //2 ,kernel_size =5 ,padding =2 ),
nn .SiLU (),
nn .Conv1d (prosody_dim //2 ,prosody_dim ,kernel_size =3 ,padding =1 ),
)
self .energy_conv =nn .Sequential (
nn .Conv1d (1 ,prosody_dim //2 ,kernel_size =5 ,padding =2 ),
nn .SiLU (),
nn .Conv1d (prosody_dim //2 ,prosody_dim ,kernel_size =3 ,padding =1 ),
)
self .vad_head =nn .Sequential (
nn .Linear (hidden_size ,hidden_size //2 ),
nn .SiLU (),
nn .Linear (hidden_size //2 ,2 ),
)
self .event_classifier =nn .Sequential (
nn .Linear (hidden_size +prosody_dim *2 ,hidden_size ),
nn .SiLU (),
nn .Dropout (dropout ),
nn .Linear (hidden_size ,hidden_size //2 ),
nn .SiLU (),
nn .Linear (hidden_size //2 ,8 ),
)
self .temporal_attn =nn .MultiheadAttention (
embed_dim =hidden_size ,
num_heads =num_heads ,
dropout =dropout ,
batch_first =True ,
)
self .eot_head =nn .Sequential (
nn .Linear (hidden_size +prosody_dim *2 ,hidden_size ),
nn .SiLU (),
nn .Dropout (dropout ),
nn .Linear (hidden_size ,num_eot_classes ),
)
self .backoff_head =nn .Sequential (
nn .Linear (hidden_size ,hidden_size //4 ),
nn .SiLU (),
nn .Linear (hidden_size //4 ,1 ),
nn .Sigmoid (),
)
print (f" 🎙️ ProsodyAwareEoTPredictor: {num_eot_classes } turn states, {prosody_dim }d prosody")
def extract_prosody (self ,audio_features :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ]:
"""Extract pitch and energy prosodic features."""
batch_size ,seq_len ,hidden =audio_features .shape
x =audio_features .transpose (1 ,2 )
pitch_proxy =x [:,:1 ,:]
energy_proxy =x .pow (2 ).mean (dim =1 ,keepdim =True )
pitch_features =self .pitch_conv (pitch_proxy ).transpose (1 ,2 )
energy_features =self .energy_conv (energy_proxy ).transpose (1 ,2 )
return pitch_features ,energy_features
def forward (
self ,
audio_features :torch .Tensor ,
attention_mask :Optional [torch .Tensor ]=None ,
)->dict :
"""
Predict end-of-turn and interruption events.
Args:
audio_features: [B, T, hidden_size] encoded audio
attention_mask: [B, T] optional mask
Returns:
dict with:
- eot_logits: [B, T, num_eot_classes] turn state predictions
- event_logits: [B, T, 8] interruption event predictions
- vad_logits: [B, T, 2] voice activity predictions
- backoff_prob: [B, T, 1] backoff probability
"""
batch_size ,seq_len ,_ =audio_features .shape
pitch_features ,energy_features =self .extract_prosody (audio_features )
if attention_mask is not None :
key_padding_mask =~attention_mask .bool ()
else :
key_padding_mask =None
contextualized ,_ =self .temporal_attn (
audio_features ,audio_features ,audio_features ,
key_padding_mask =key_padding_mask ,
)
combined =torch .cat ([contextualized ,pitch_features ,energy_features ],dim =-1 )
eot_logits =self .eot_head (combined )
event_logits =self .event_classifier (combined )
vad_logits =self .vad_head (contextualized )
backoff_prob =self .backoff_head (contextualized )
return {
"eot_logits":eot_logits ,
"event_logits":event_logits ,
"vad_logits":vad_logits ,
"backoff_prob":backoff_prob ,
}
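# Illustrative usage sketch for ProsodyAwareEoTPredictor (not executed at import; the
# tensors below are random placeholders and shapes follow the docstring above):
#
#   predictor = ProsodyAwareEoTPredictor(hidden_size=1024, num_eot_classes=5)
#   audio_feats = torch.randn(2, 50, 1024)             # [B, T, hidden_size]
#   out = predictor(audio_feats)
#   out["eot_logits"].shape                             # torch.Size([2, 50, 5])
#   out["event_logits"].shape                           # torch.Size([2, 50, 8])
#   out["vad_logits"].shape                             # torch.Size([2, 50, 2])
#   out["backoff_prob"].shape                           # torch.Size([2, 50, 1])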
class AVDEmotionRecognizer (nn .Module ):
"""
Continuous AVD (Arousal/Valence/Dominance) Emotion Recognition.
Predicts both discrete emotion categories and continuous AVD values
for nuanced emotion understanding and response adaptation.
"""
def __init__ (
self ,
hidden_size :int =1024 ,
num_emotions :int =10 ,
num_layers :int =2 ,
dropout :float =0.1 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_emotions =num_emotions
self .emotion_query =nn .Parameter (torch .randn (1 ,1 ,hidden_size ))
self .emotion_attn =nn .MultiheadAttention (
embed_dim =hidden_size ,
num_heads =8 ,
dropout =dropout ,
batch_first =True ,
)
self .temporal_conv =nn .Sequential (
nn .Conv1d (hidden_size ,hidden_size ,kernel_size =5 ,padding =2 ,groups =8 ),
nn .SiLU (),
nn .Conv1d (hidden_size ,hidden_size ,kernel_size =3 ,padding =1 ),
)
self .emotion_classifier =nn .Sequential (
nn .Linear (hidden_size ,hidden_size //2 ),
nn .SiLU (),
nn .Dropout (dropout ),
nn .Linear (hidden_size //2 ,num_emotions ),
)
self .arousal_head =nn .Sequential (
nn .Linear (hidden_size ,hidden_size //4 ),
nn .SiLU (),
nn .Linear (hidden_size //4 ,1 ),
nn .Sigmoid (),
)
self .valence_head =nn .Sequential (
nn .Linear (hidden_size ,hidden_size //4 ),
nn .SiLU (),
nn .Linear (hidden_size //4 ,1 ),
nn .Tanh (),
)
self .dominance_head =nn .Sequential (
nn .Linear (hidden_size ,hidden_size //4 ),
nn .SiLU (),
nn .Linear (hidden_size //4 ,1 ),
nn .Sigmoid (),
)
self .response_adaptation =nn .Sequential (
nn .Linear (hidden_size +3 ,hidden_size //2 ),
nn .SiLU (),
nn .Linear (hidden_size //2 ,4 ),
)
print (f" 😊 AVDEmotionRecognizer: {num_emotions } emotions + continuous AVD")
def forward (
self ,
audio_features :torch .Tensor ,
attention_mask :Optional [torch .Tensor ]=None ,
)->dict :
"""
Recognize emotion from audio features.
Args:
audio_features: [B, T, hidden_size] encoded audio
attention_mask: [B, T] optional mask
Returns:
dict with:
- emotion_logits: [B, num_emotions] discrete emotion
- arousal: [B, 1] arousal value (0-1)
- valence: [B, 1] valence value (-1 to 1)
- dominance: [B, 1] dominance value (0-1)
- response_mode: [B, 4] response adaptation logits
"""
batch_size ,seq_len ,_ =audio_features .shape
x_conv =self .temporal_conv (audio_features .transpose (1 ,2 )).transpose (1 ,2 )
x =audio_features +x_conv
query =self .emotion_query .expand (batch_size ,-1 ,-1 )
if attention_mask is not None :
key_padding_mask =~attention_mask .bool ()
else :
key_padding_mask =None
emotion_context ,_ =self .emotion_attn (
query ,x ,x ,
key_padding_mask =key_padding_mask ,
)
emotion_vec =emotion_context .squeeze (1 )
emotion_logits =self .emotion_classifier (emotion_vec )
arousal =self .arousal_head (emotion_vec )
valence =self .valence_head (emotion_vec )
dominance =self .dominance_head (emotion_vec )
avd_concat =torch .cat ([emotion_vec ,arousal ,valence ,dominance ],dim =-1 )
response_mode =self .response_adaptation (avd_concat )
return {
"emotion_logits":emotion_logits ,
"arousal":arousal ,
"valence":valence ,
"dominance":dominance ,
"response_mode":response_mode ,
}
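# Illustrative usage sketch for AVDEmotionRecognizer (random placeholder tensors;
# per-clip outputs are pooled over time by the learned emotion query):
#
#   recognizer = AVDEmotionRecognizer(hidden_size=1024, num_emotions=10)
#   audio_feats = torch.randn(2, 50, 1024)              # [B, T, hidden_size]
#   out = recognizer(audio_feats)
#   out["emotion_logits"].shape                          # torch.Size([2, 10])
#   out["arousal"], out["valence"], out["dominance"]     # each [2, 1]; sigmoid / tanh / sigmoid ranges
#   out["response_mode"].shape                           # torch.Size([2, 4])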
class DynamicLatentVocalizer (nn .Module ):
"""
Dynamic Latent Vocalizations for singing, rapping, humming, etc.
Extends speech synthesis to include:
- Singing with pitch control
- Rapping with rhythm control
- Humming, whistling, chanting
- Musical style transfer
"""
def __init__ (
self ,
hidden_size :int =1024 ,
num_styles :int =8 ,
num_vocal_modes :int =6 ,
pitch_bins :int =256 ,
tempo_range :Tuple [int ,int ]=(60 ,180 ),
):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_styles =num_styles
self .num_vocal_modes =num_vocal_modes
self .pitch_bins =pitch_bins
self .tempo_range =tempo_range
self .style_embed =nn .Embedding (num_styles ,hidden_size //4 )
self .mode_embed =nn .Embedding (num_vocal_modes ,hidden_size //4 )
self .pitch_embed =nn .Embedding (pitch_bins ,hidden_size //4 )
self .pitch_predictor =nn .Sequential (
nn .Linear (hidden_size ,hidden_size //2 ),
nn .SiLU (),
nn .Linear (hidden_size //2 ,pitch_bins ),
)
self .tempo_encoder =nn .Sequential (
nn .Linear (1 ,hidden_size //8 ),
nn .SiLU (),
nn .Linear (hidden_size //8 ,hidden_size //4 ),
)
self .rhythm_attn =nn .MultiheadAttention (
embed_dim =hidden_size ,
num_heads =4 ,
dropout =0.1 ,
batch_first =True ,
)
self .style_transfer =nn .Sequential (
nn .Linear (hidden_size *2 ,hidden_size ),  # input = text features (hidden) + 4 condition embeddings (4 * hidden//4)
nn .SiLU (),
nn .Linear (hidden_size ,hidden_size ),
)
self .lyrics_aligner =MonotonicAlignmentSearch (hidden_size )
self .output_proj =nn .Linear (hidden_size ,hidden_size )
print (f" 🎵 DynamicLatentVocalizer: {num_styles } styles, {num_vocal_modes } modes")
def forward (
self ,
text_features :torch .Tensor ,
style_id :Optional [torch .Tensor ]=None ,
mode_id :Optional [torch .Tensor ]=None ,
target_pitch :Optional [torch .Tensor ]=None ,
tempo_bpm :Optional [torch .Tensor ]=None ,
)->dict :
"""
Generate vocalization features for singing/rapping/etc.
Args:
text_features: [B, T, hidden_size] text/lyrics embeddings
style_id: [B] style indices (0-7)
mode_id: [B] vocal mode indices (0-5)
target_pitch: [B, T] optional pitch targets
tempo_bpm: [B] optional tempo in BPM
Returns:
dict with:
- vocal_features: [B, T', hidden_size] vocalization features
- pitch_logits: [B, T, pitch_bins] predicted pitch
- alignment: [B, T, T'] text-to-audio alignment
"""
batch_size ,seq_len ,_ =text_features .shape
device =text_features .device
if style_id is None :
style_id =torch .zeros (batch_size ,dtype =torch .long ,device =device )
if mode_id is None :
mode_id =torch .zeros (batch_size ,dtype =torch .long ,device =device )
style_emb =self .style_embed (style_id ).unsqueeze (1 ).expand (-1 ,seq_len ,-1 )
mode_emb =self .mode_embed (mode_id ).unsqueeze (1 ).expand (-1 ,seq_len ,-1 )
if tempo_bpm is not None :
tempo_norm =(tempo_bpm .float ()-self .tempo_range [0 ])/(self .tempo_range [1 ]-self .tempo_range [0 ])
tempo_emb =self .tempo_encoder (tempo_norm .unsqueeze (-1 )).unsqueeze (1 ).expand (-1 ,seq_len ,-1 )
else :
tempo_emb =torch .zeros (batch_size ,seq_len ,self .hidden_size //4 ,device =device )
pitch_logits =self .pitch_predictor (text_features )
if target_pitch is not None :
pitch_emb =self .pitch_embed (target_pitch )
else :
pitch_idx =pitch_logits .argmax (dim =-1 )
pitch_emb =self .pitch_embed (pitch_idx )
conditions =torch .cat ([style_emb ,mode_emb ,tempo_emb ,pitch_emb ],dim =-1 )
combined =torch .cat ([text_features ,conditions ],dim =-1 )
vocal_features =self .style_transfer (combined )
vocal_features ,_ =self .rhythm_attn (vocal_features ,vocal_features ,vocal_features )
alignment ,durations =self .lyrics_aligner (text_features )
vocal_features =self .output_proj (vocal_features )
return {
"vocal_features":vocal_features ,
"pitch_logits":pitch_logits ,
"alignment":alignment ,
"durations":durations ,
}
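# Illustrative usage sketch for DynamicLatentVocalizer (random placeholder tensors;
# the style/mode indices and tempo below are arbitrary example values, not trained presets):
#
#   vocalizer = DynamicLatentVocalizer(hidden_size=1024, num_styles=8, num_vocal_modes=6)
#   lyrics = torch.randn(1, 32, 1024)                   # [B, T, hidden_size] lyric embeddings
#   style = torch.tensor([2])                           # example style index
#   mode = torch.tensor([1])                            # example vocal mode index
#   tempo = torch.tensor([120.0])                       # BPM within tempo_range
#   out = vocalizer(lyrics, style_id=style, mode_id=mode, tempo_bpm=tempo)
#   out["vocal_features"].shape                          # torch.Size([1, 32, 1024])
#   out["pitch_logits"].shape                            # torch.Size([1, 32, 256])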
class NeuralSoundEffectGenerator (nn .Module ):
"""
Neural Style Transfer for Sound Effects and Non-verbal Vocalizations.
Generates:
- Beatboxing (kicks, snares, hi-hats)
- Vocal clicks, pops, tongue sounds
- Breathing, sighing, gasping
- Non-verbal expressions (hmm, aha, wow, etc.)
- Polyphonic ad-libs and harmonies
"""
def __init__ (
self ,
hidden_size :int =1024 ,
num_effect_types :int =20 ,
num_layers :int =3 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_effect_types =num_effect_types
self .effect_embed =nn .Embedding (num_effect_types ,hidden_size )
self .generator =nn .Sequential (
nn .Linear (hidden_size ,hidden_size *4 ),
nn .SiLU (),
nn .Unflatten (1 ,(hidden_size ,4 )),
nn .ConvTranspose1d (hidden_size ,hidden_size //2 ,4 ,2 ,1 ),
nn .SiLU (),
nn .ConvTranspose1d (hidden_size //2 ,hidden_size //4 ,4 ,2 ,1 ),
nn .SiLU (),
nn .ConvTranspose1d (hidden_size //4 ,hidden_size //8 ,4 ,2 ,1 ),
nn .SiLU (),
nn .ConvTranspose1d (hidden_size //8 ,1 ,4 ,2 ,1 ),
nn .Tanh (),
)
self .duration_head =nn .Sequential (
nn .Linear (hidden_size ,hidden_size //4 ),
nn .SiLU (),
nn .Linear (hidden_size //4 ,1 ),
nn .Softplus (),
)
self .intensity_head =nn .Sequential (
nn .Linear (hidden_size ,hidden_size //4 ),
nn .SiLU (),
nn .Linear (hidden_size //4 ,1 ),
nn .Sigmoid (),
)
self .blend_attn =nn .MultiheadAttention (
embed_dim =hidden_size ,
num_heads =4 ,
batch_first =True ,
)
print (f" 🥁 NeuralSoundEffectGenerator: {num_effect_types } effect types")
def forward (
self ,
effect_ids :torch .Tensor ,
context :Optional [torch .Tensor ]=None ,
intensity :Optional [torch .Tensor ]=None ,
)->dict :
"""
Generate sound effect features.
Args:
effect_ids: [B] or [B, N] effect type indices
context: [B, T, hidden_size] optional context features
intensity: [B] or [B, N] optional intensity values
Returns:
dict with:
- effect_features: [B, T', hidden_size] generated features
- waveform: [B, 1, samples] raw waveform (if generating directly)
- duration: [B, 1] predicted duration
"""
if effect_ids .dim ()==1 :
effect_ids =effect_ids .unsqueeze (1 )
batch_size ,num_effects =effect_ids .shape
device =effect_ids .device
effect_emb =self .effect_embed (effect_ids )
if num_effects >1 :
effect_emb ,_ =self .blend_attn (effect_emb ,effect_emb ,effect_emb )
effect_vec =effect_emb .mean (dim =1 )
if context is not None :
context_vec =context .mean (dim =1 )
effect_vec =effect_vec +context_vec
duration =self .duration_head (effect_vec )
pred_intensity =self .intensity_head (effect_vec )
if intensity is not None :
pred_intensity =intensity .unsqueeze (-1 )if intensity .dim ()==1 else intensity
effect_vec =effect_vec *pred_intensity
waveform =self .generator (effect_vec )
return {
"effect_features":effect_emb ,
"waveform":waveform ,
"duration":duration ,
"intensity":pred_intensity ,
}
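# Illustrative usage sketch for NeuralSoundEffectGenerator (random effect indices; with
# the default ConvTranspose1d stack the decoded waveform has a fixed length of 64 samples):
#
#   sfx = NeuralSoundEffectGenerator(hidden_size=1024, num_effect_types=20)
#   effect_ids = torch.tensor([3, 7])                   # [B] effect type indices
#   out = sfx(effect_ids)
#   out["waveform"].shape                                # torch.Size([2, 1, 64])
#   out["duration"].shape                                # torch.Size([2, 1]) (Softplus, > 0)
#   out["intensity"].shape                               # torch.Size([2, 1]) (Sigmoid, 0-1)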
class SpeculativeAudioDecoder (nn .Module ):
"""
Mid-stream Token Rewriting support for Speculative Decoding in audio.
Allows the model to:
- Generate draft audio tokens speculatively
- Accept/reject based on user feedback or context change
- Rollback and regenerate from checkpoints
- Smooth transitions during rewrites
"""
def __init__ (
self ,
hidden_size :int =1024 ,
draft_length :int =10 ,
num_heads :int =8 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .draft_length =draft_length
self .draft_head =nn .Sequential (
nn .Linear (hidden_size ,hidden_size ),
nn .SiLU (),
nn .Linear (hidden_size ,hidden_size ),
)
self .verify_head =nn .Sequential (
nn .Linear (hidden_size *2 ,hidden_size ),
nn .SiLU (),
nn .Linear (hidden_size ,1 ),
nn .Sigmoid (),
)
self .checkpoint_encoder =nn .GRU (
input_size =hidden_size ,
hidden_size =hidden_size ,
num_layers =1 ,
batch_first =True ,
)
self .smoother =nn .Sequential (
nn .Linear (hidden_size *2 ,hidden_size ),
nn .SiLU (),
nn .Linear (hidden_size ,hidden_size ),
)
self .confidence_head =nn .Sequential (
nn .Linear (hidden_size ,hidden_size //4 ),
nn .SiLU (),
nn .Linear (hidden_size //4 ,1 ),
nn .Sigmoid (),
)
print (f" ⚡ SpeculativeAudioDecoder: draft_length={draft_length }")
def generate_draft (
self ,
context :torch .Tensor ,
num_tokens :int =None ,
)->Tuple [torch .Tensor ,torch .Tensor ]:
"""
Generate draft tokens speculatively.
Args:
context: [B, T, hidden_size] context features
num_tokens: number of draft tokens (default: self.draft_length)
Returns:
draft_tokens: [B, N, hidden_size] draft features
confidence: [B, N, 1] confidence per token
"""
if num_tokens is None :
num_tokens =self .draft_length
batch_size =context .shape [0 ]
device =context .device
seed =context [:,-1 :,:]
draft_tokens =[]
confidences =[]
current =seed
for _ in range (num_tokens ):
draft =self .draft_head (current )
conf =self .confidence_head (draft )
draft_tokens .append (draft )
confidences .append (conf )
current =draft
draft_tokens =torch .cat (draft_tokens ,dim =1 )
confidences =torch .cat (confidences ,dim =1 )
return draft_tokens ,confidences
def verify_draft (
self ,
draft_tokens :torch .Tensor ,
new_context :torch .Tensor ,
)->torch .Tensor :
"""
Verify if draft tokens should be accepted given new context.
Args:
draft_tokens: [B, N, hidden_size] draft features
new_context: [B, T, hidden_size] updated context
Returns:
accept_prob: [B, N, 1] probability to accept each token
"""
context_summary =new_context .mean (dim =1 ,keepdim =True ).expand (-1 ,draft_tokens .shape [1 ],-1 )
combined =torch .cat ([draft_tokens ,context_summary ],dim =-1 )
accept_prob =self .verify_head (combined )
return accept_prob
def create_checkpoint (self ,hidden_state :torch .Tensor )->torch .Tensor :
"""Save hidden state for potential rollback."""
_ ,checkpoint =self .checkpoint_encoder (hidden_state )
return checkpoint .squeeze (0 )
def smooth_transition (
self ,
old_features :torch .Tensor ,
new_features :torch .Tensor ,
)->torch .Tensor :
"""Create smooth transition between old and new features."""
combined =torch .cat ([old_features ,new_features ],dim =-1 )
return self .smoother (combined )
def forward (
self ,
context :torch .Tensor ,
generate_draft :bool =True ,
verify_with :Optional [torch .Tensor ]=None ,
)->dict :
"""
Full speculative decoding step.
Args:
context: [B, T, hidden_size] current context
generate_draft: whether to generate new draft
verify_with: [B, T', hidden_size] new context to verify against
Returns:
dict with draft tokens, confidence, verification results
"""
results ={}
results ["checkpoint"]=self .create_checkpoint (context )
if generate_draft :
draft ,confidence =self .generate_draft (context )
results ["draft_tokens"]=draft
results ["confidence"]=confidence
if verify_with is not None and "draft_tokens"in results :
accept_prob =self .verify_draft (results ["draft_tokens"],verify_with )
results ["accept_prob"]=accept_prob
return results
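# Illustrative draft/verify cycle for SpeculativeAudioDecoder (random placeholder context;
# the acceptance threshold below is an example policy, not part of the module):
#
#   spec = SpeculativeAudioDecoder(hidden_size=1024, draft_length=10)
#   context = torch.randn(1, 40, 1024)                  # [B, T, hidden_size]
#   step = spec(context, generate_draft=True)
#   step["draft_tokens"].shape                           # torch.Size([1, 10, 1024])
#   step["confidence"].shape                             # torch.Size([1, 10, 1])
#   new_context = torch.randn(1, 45, 1024)               # updated context after user feedback
#   accept = spec.verify_draft(step["draft_tokens"], new_context)   # [1, 10, 1]
#   keep = accept > 0.5                                   # example acceptance threshold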
# ==============================================================================
# MODELS.GENERATORS.IMAGE
# ==============================================================================
EPS =1e-5
class RoPE2D (nn .Module ):
"""
2D Rotary Position Embedding for flexible aspect ratios.
Encodes (x, y) spatial positions for patch-based DiT.
"""
def __init__ (self ,dim :int ,max_height :int =128 ,max_width :int =128 ,base :float =10000.0 ):
super ().__init__ ()
self .dim =dim
self .max_height =max_height
self .max_width =max_width
self .base =base
self .dim_x =dim //2
self .dim_y =dim -self .dim_x
inv_freq_x =1.0 /(base **(torch .arange (0 ,self .dim_x ,2 ,dtype =torch .float32 )/self .dim_x ))
inv_freq_y =1.0 /(base **(torch .arange (0 ,self .dim_y ,2 ,dtype =torch .float32 )/self .dim_y ))
self .register_buffer ('inv_freq_x',inv_freq_x ,persistent =False )
self .register_buffer ('inv_freq_y',inv_freq_y ,persistent =False )
def forward (self ,x :torch .Tensor ,height :int ,width :int )->Tuple [torch .Tensor ,torch .Tensor ]:
device =x .device
dtype =x .dtype
pos_x =torch .arange (width ,device =device ,dtype =torch .float32 )
pos_y =torch .arange (height ,device =device ,dtype =torch .float32 )
freqs_x =torch .outer (pos_x ,self .inv_freq_x .to (device ))
freqs_y =torch .outer (pos_y ,self .inv_freq_y .to (device ))
freqs_x =torch .cat ([freqs_x ,freqs_x ],dim =-1 )
freqs_y =torch .cat ([freqs_y ,freqs_y ],dim =-1 )
cos_2d =torch .zeros (height ,width ,self .dim ,device =device ,dtype =dtype )
sin_2d =torch .zeros (height ,width ,self .dim ,device =device ,dtype =dtype )
for y in range (height ):
for w in range (width ):
cos_2d [y ,w ,:self .dim_x ]=freqs_x [w ].cos ().to (dtype )
sin_2d [y ,w ,:self .dim_x ]=freqs_x [w ].sin ().to (dtype )
cos_2d [y ,w ,self .dim_x :]=freqs_y [y ].cos ().to (dtype )
sin_2d [y ,w ,self .dim_x :]=freqs_y [y ].sin ().to (dtype )
cos_2d =cos_2d .view (height *width ,self .dim )
sin_2d =sin_2d .view (height *width ,self .dim )
return cos_2d ,sin_2d
def apply_rope_2d (x :torch .Tensor ,cos :torch .Tensor ,sin :torch .Tensor )->torch .Tensor :
x1 =x [...,:x .shape [-1 ]//2 ]
x2 =x [...,x .shape [-1 ]//2 :]
rotated =torch .cat ((-x2 ,x1 ),dim =-1 )
return x *cos +rotated *sin
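# Illustrative shape sketch for RoPE2D / apply_rope_2d (random placeholder queries; the
# broadcast pattern mirrors how DualStreamSelfAttention below uses these helpers):
#
#   rope = RoPE2D(dim=64, max_height=32, max_width=32)
#   q = torch.randn(1, 8, 16 * 16, 64)                  # [B, heads, H*W, head_dim]
#   cos, sin = rope(q, height=16, width=16)             # each [H*W, head_dim] = [256, 64]
#   cos, sin = cos.unsqueeze(0).unsqueeze(1), sin.unsqueeze(0).unsqueeze(1)
#   q_rot = apply_rope_2d(q, cos, sin)                   # same shape as q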
class ImageExpert (nn .Module ):
"""Single expert for DiT with SwiGLU activation."""
def __init__ (self ,hidden_size :int ,intermediate_size :int ):
super ().__init__ ()
self .gate_proj =nn .Linear (hidden_size ,intermediate_size ,bias =False )
self .up_proj =nn .Linear (hidden_size ,intermediate_size ,bias =False )
self .down_proj =nn .Linear (intermediate_size ,hidden_size ,bias =False )
self .act_fn =nn .SiLU ()
def forward (self ,x :torch .Tensor )->torch .Tensor :
return self .down_proj (self .act_fn (self .gate_proj (x ))*self .up_proj (x ))
class ImageMoERouter (nn .Module ):
"""Router for Image MoE with spatial awareness."""
def __init__ (self ,hidden_size :int ,num_experts :int =4 ,top_k :int =2 ):
super ().__init__ ()
self .num_experts =num_experts
self .top_k =top_k
self .norm =nn .LayerNorm (hidden_size )
self .gate =nn .Linear (hidden_size ,num_experts ,bias =False )
nn .init .normal_ (self .gate .weight ,mean =0.0 ,std =0.01 )
def forward (self ,x :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ]:
x_norm =self .norm (x )
router_logits =self .gate (x_norm )
router_probs =F .softmax (router_logits ,dim =-1 ,dtype =x .dtype )
top_k_probs ,top_k_indices =torch .topk (router_probs ,self .top_k ,dim =-1 )
top_k_probs =top_k_probs /(top_k_probs .sum (dim =-1 ,keepdim =True )+EPS )
return top_k_probs ,top_k_indices
class ImageMoELayer (nn .Module ):
"""MoE Layer for DiT with shared expert."""
def __init__ (self ,hidden_size :int ,intermediate_size :int ,num_experts :int =4 ,top_k :int =2 ):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_experts =num_experts
self .top_k =top_k
self .router =ImageMoERouter (hidden_size ,num_experts ,top_k )
self .experts =nn .ModuleList ([
ImageExpert (hidden_size ,intermediate_size )
for _ in range (num_experts )
])
self .shared_expert =ImageExpert (hidden_size ,intermediate_size )
def forward (self ,x :torch .Tensor )->torch .Tensor :
batch_size ,seq_len ,hidden_size =x .shape
x_flat =x .view (-1 ,hidden_size )
top_k_probs ,top_k_indices =self .router (x_flat )
output =torch .zeros_like (x_flat )
for expert_idx in range (self .num_experts ):
expert =self .experts [expert_idx ]
for k in range (self .top_k ):
mask =(top_k_indices [:,k ]==expert_idx )
if mask .any ():
expert_input =x_flat [mask ]
expert_output =expert (expert_input )
weight =top_k_probs [mask ,k :k +1 ]
output [mask ]=output [mask ]+weight *expert_output
shared_output =self .shared_expert (x_flat )
output =output +shared_output
return output .view (batch_size ,seq_len ,hidden_size )
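# Illustrative routing sketch for ImageMoELayer (random patch tokens; each token is
# dispatched to its top-2 experts and the shared expert output is always added):
#
#   moe = ImageMoELayer(hidden_size=512, intermediate_size=2048, num_experts=4, top_k=2)
#   patches = torch.randn(2, 256, 512)                  # [B, num_patches, hidden]
#   out = moe(patches)                                   # [2, 256, 512], same shape as input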
class DualStreamSelfAttention (nn .Module ):
"""
Symmetric Dual-Stream Self-Attention (SD3/Flux-style).
Two parallel streams with cross-stream information exchange.
Uses PyTorch SDPA (dispatching to FlashAttention / memory-efficient kernels when available) for O(N) memory.
"""
def __init__ (self ,hidden_size :int ,num_heads :int =8 ,max_height :int =64 ,max_width :int =64 ):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_heads =num_heads
self .head_dim =hidden_size //num_heads
self .scale =self .head_dim **-0.5
self ._qk_scale =self .head_dim **-0.25
self .to_qkv_a =nn .Linear (hidden_size ,hidden_size *3 ,bias =False )
self .to_qkv_b =nn .Linear (hidden_size ,hidden_size *3 ,bias =False )
self .to_out_a =nn .Linear (hidden_size ,hidden_size ,bias =False )
self .to_out_b =nn .Linear (hidden_size ,hidden_size ,bias =False )
self .norm_a =nn .LayerNorm (hidden_size )
self .norm_b =nn .LayerNorm (hidden_size )
self .rope_2d =RoPE2D (self .head_dim ,max_height ,max_width )
def forward (self ,x_a :torch .Tensor ,x_b :torch .Tensor ,height :int ,width :int )->Tuple [torch .Tensor ,torch .Tensor ]:
batch_size ,seq_len ,_ =x_a .shape
x_a =self .norm_a (x_a )
x_b =self .norm_b (x_b )
qkv_a =self .to_qkv_a (x_a ).reshape (batch_size ,seq_len ,3 ,self .num_heads ,self .head_dim )
qkv_b =self .to_qkv_b (x_b ).reshape (batch_size ,seq_len ,3 ,self .num_heads ,self .head_dim )
q_a ,k_a ,v_a =qkv_a .unbind (dim =2 )
q_b ,k_b ,v_b =qkv_b .unbind (dim =2 )
cos ,sin =self .rope_2d (x_a ,height ,width )
cos =cos .unsqueeze (0 ).unsqueeze (1 )
sin =sin .unsqueeze (0 ).unsqueeze (1 )
q_a =q_a .transpose (1 ,2 )
k_a =k_a .transpose (1 ,2 )
v_a =v_a .transpose (1 ,2 )
q_b =q_b .transpose (1 ,2 )
k_b =k_b .transpose (1 ,2 )
v_b =v_b .transpose (1 ,2 )
q_a =apply_rope_2d (q_a ,cos ,sin )
k_a =apply_rope_2d (k_a ,cos ,sin )
q_b =apply_rope_2d (q_b ,cos ,sin )
k_b =apply_rope_2d (k_b ,cos ,sin )
k_combined =torch .cat ([k_a ,k_b ],dim =2 )
v_combined =torch .cat ([v_a ,v_b ],dim =2 )
out_a =F .scaled_dot_product_attention (
q_a *self ._qk_scale ,k_combined *self ._qk_scale ,v_combined ,
is_causal =False ,scale =1.0 ,
)
out_b =F .scaled_dot_product_attention (
q_b *self ._qk_scale ,k_combined *self ._qk_scale ,v_combined ,
is_causal =False ,scale =1.0 ,
)
out_a =out_a .transpose (1 ,2 ).reshape (batch_size ,seq_len ,self .hidden_size )
out_b =out_b .transpose (1 ,2 ).reshape (batch_size ,seq_len ,self .hidden_size )
out_a =self .to_out_a (out_a )
out_b =self .to_out_b (out_b )
return out_a ,out_b
class CrossAttention (nn .Module ):
"""Cross-attention for text conditioning."""
def __init__ (self ,query_dim :int ,context_dim :int =None ,heads :int =8 ):
super ().__init__ ()
self .heads =heads
context_dim =context_dim or query_dim
self .head_dim =query_dim //heads
self .scale =self .head_dim **-0.5
self .norm =nn .LayerNorm (query_dim )
self .to_q =nn .Linear (query_dim ,query_dim ,bias =False )
self .to_k =nn .Linear (context_dim ,query_dim ,bias =False )
self .to_v =nn .Linear (context_dim ,query_dim ,bias =False )
self .to_out =nn .Linear (query_dim ,query_dim ,bias =False )
def forward (self ,x :torch .Tensor ,context :torch .Tensor )->torch .Tensor :
batch_size ,seq_len ,_ =x .shape
ctx_len =context .shape [1 ]
x =self .norm (x )
q =self .to_q (x ).reshape (batch_size ,seq_len ,self .heads ,self .head_dim ).transpose (1 ,2 )
k =self .to_k (context ).reshape (batch_size ,ctx_len ,self .heads ,self .head_dim ).transpose (1 ,2 )
v =self .to_v (context ).reshape (batch_size ,ctx_len ,self .heads ,self .head_dim ).transpose (1 ,2 )
qk_scale =self .head_dim **-0.25
out =F .scaled_dot_product_attention (
q *qk_scale ,k *qk_scale ,v ,
is_causal =False ,scale =1.0 ,
)
out =out .transpose (1 ,2 ).reshape (batch_size ,seq_len ,-1 )
out =self .to_out (out )
return out
class DiTBlock (nn .Module ):
"""
DiT Block with Dual-Stream Attention and MoE FFN.
"""
def __init__ (self ,hidden_size :int ,context_dim :int ,num_heads :int =8 ,num_experts :int =4 ,max_height :int =64 ,max_width :int =64 ):
super ().__init__ ()
self .dual_attn =DualStreamSelfAttention (hidden_size ,num_heads ,max_height ,max_width )
self .cross_attn_a =CrossAttention (hidden_size ,context_dim ,num_heads )
self .cross_attn_b =CrossAttention (hidden_size ,context_dim ,num_heads )
self .moe_a =ImageMoELayer (hidden_size ,hidden_size *4 ,num_experts )
self .moe_b =ImageMoELayer (hidden_size ,hidden_size *4 ,num_experts )
self .adaLN_a =nn .Sequential (
nn .SiLU (),
nn .Linear (hidden_size ,hidden_size *6 ),
)
self .adaLN_b =nn .Sequential (
nn .SiLU (),
nn .Linear (hidden_size ,hidden_size *6 ),
)
self .norm1_a =nn .LayerNorm (hidden_size ,elementwise_affine =False )
self .norm1_b =nn .LayerNorm (hidden_size ,elementwise_affine =False )
self .norm2_a =nn .LayerNorm (hidden_size ,elementwise_affine =False )
self .norm2_b =nn .LayerNorm (hidden_size ,elementwise_affine =False )
def forward (self ,x_a :torch .Tensor ,x_b :torch .Tensor ,context :torch .Tensor ,t_emb :torch .Tensor ,height :int ,width :int )->Tuple [torch .Tensor ,torch .Tensor ]:
shift_a ,scale_a ,gate_a ,shift2_a ,scale2_a ,gate2_a =self .adaLN_a (t_emb ).chunk (6 ,dim =-1 )
shift_b ,scale_b ,gate_b ,shift2_b ,scale2_b ,gate2_b =self .adaLN_b (t_emb ).chunk (6 ,dim =-1 )
shift_a =shift_a .unsqueeze (1 )
scale_a =scale_a .unsqueeze (1 )
gate_a =gate_a .unsqueeze (1 )
shift2_a =shift2_a .unsqueeze (1 )
scale2_a =scale2_a .unsqueeze (1 )
gate2_a =gate2_a .unsqueeze (1 )
shift_b =shift_b .unsqueeze (1 )
scale_b =scale_b .unsqueeze (1 )
gate_b =gate_b .unsqueeze (1 )
shift2_b =shift2_b .unsqueeze (1 )
scale2_b =scale2_b .unsqueeze (1 )
gate2_b =gate2_b .unsqueeze (1 )
x_a_norm =self .norm1_a (x_a )*(1 +scale_a )+shift_a
x_b_norm =self .norm1_b (x_b )*(1 +scale_b )+shift_b
attn_out_a ,attn_out_b =self .dual_attn (x_a_norm ,x_b_norm ,height ,width )
x_a =x_a +gate_a *attn_out_a
x_b =x_b +gate_b *attn_out_b
x_a =x_a +self .cross_attn_a (x_a ,context )
x_b =x_b +self .cross_attn_b (x_b ,context )
x_a_norm =self .norm2_a (x_a )*(1 +scale2_a )+shift2_a
x_b_norm =self .norm2_b (x_b )*(1 +scale2_b )+shift2_b
x_a =x_a +gate2_a *self .moe_a (x_a_norm )
x_b =x_b +gate2_b *self .moe_b (x_b_norm )
return x_a ,x_b
class FlowMatchingScheduler :
"""Flow Matching scheduler for image generation."""
def __init__ (self ,num_steps :int =50 ,sigma_min :float =0.002 ):
self .num_steps =num_steps
self .sigma_min =sigma_min
self .timesteps =torch .linspace (1 ,0 ,num_steps +1 )
def get_velocity (self ,x_t :torch .Tensor ,x_0 :torch .Tensor ,t :torch .Tensor )->torch .Tensor :
return x_0 -x_t
def step (self ,model_output :torch .Tensor ,t :torch .Tensor ,t_prev :torch .Tensor ,x_t :torch .Tensor )->torch .Tensor :
dt =t -t_prev
x_prev =x_t +model_output *dt .view (-1 ,1 ,1 ,1 )
return x_prev
def add_noise (self ,x_0 :torch .Tensor ,t :torch .Tensor )->torch .Tensor :
noise =torch .randn_like (x_0 )
t =t .to (x_0 .dtype ).view (-1 ,1 ,1 ,1 )
x_t =t *noise +(1 -t )*x_0
return x_t
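# Illustrative flow-matching sketch (random latents; shows the linear noise path used for
# training targets and one Euler step of the sampling ODE):
#
#   sched = FlowMatchingScheduler(num_steps=50)
#   x0 = torch.randn(2, 4, 32, 32)                       # clean latents
#   t = torch.rand(2)                                    # t=1 is pure noise, t=0 is data
#   x_t = sched.add_noise(x0, t)                          # t * noise + (1 - t) * x0
#   v_target = sched.get_velocity(x_t, x0, t)             # x0 - x_t (regression target)
#   # one sampling step from t to t_prev < t with a predicted velocity v_pred:
#   # x_prev = sched.step(v_pred, t, t_prev, x_t)          i.e. x_t + v_pred * (t - t_prev)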
class PatchEmbed (nn .Module ):
"""Patch embedding for DiT."""
def __init__ (self ,patch_size :int =2 ,in_channels :int =4 ,hidden_size :int =512 ):
super ().__init__ ()
self .patch_size =patch_size
self .proj =nn .Conv2d (in_channels ,hidden_size ,kernel_size =patch_size ,stride =patch_size )
def forward (self ,x :torch .Tensor )->torch .Tensor :
x =self .proj (x )
x =x .flatten (2 ).transpose (1 ,2 )
return x
class UnpatchEmbed (nn .Module ):
"""Unpatch embedding to reconstruct image from patches."""
def __init__ (self ,patch_size :int =2 ,out_channels :int =4 ,hidden_size :int =512 ):
super ().__init__ ()
self .patch_size =patch_size
self .out_channels =out_channels
self .proj =nn .Linear (hidden_size ,patch_size *patch_size *out_channels )
def forward (self ,x :torch .Tensor ,height :int ,width :int )->torch .Tensor :
x =self .proj (x )
batch_size =x .shape [0 ]
x =x .reshape (batch_size ,height ,width ,self .patch_size ,self .patch_size ,self .out_channels )
x =x .permute (0 ,5 ,1 ,3 ,2 ,4 ).reshape (batch_size ,self .out_channels ,height *self .patch_size ,width *self .patch_size )
return x
class MoEDiT (nn .Module ):
"""
MoE Diffusion Transformer with Dual-Stream Attention.
"""
def __init__ (
self ,
in_channels :int =4 ,
out_channels :int =4 ,
hidden_size :int =512 ,
context_dim :int =1024 ,
num_layers :int =8 ,
num_heads :int =8 ,
num_experts :int =4 ,
patch_size :int =2 ,
max_image_size :int =64 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .patch_size =patch_size
max_patches =max_image_size //patch_size
self .time_embed =nn .Sequential (
nn .Linear (hidden_size ,hidden_size *4 ),
nn .SiLU (),
nn .Linear (hidden_size *4 ,hidden_size ),
)
self .patch_embed =PatchEmbed (patch_size ,in_channels ,hidden_size )
self .context_proj =nn .Linear (context_dim ,hidden_size )
self .blocks =nn .ModuleList ([
DiTBlock (hidden_size ,hidden_size ,num_heads ,num_experts ,max_patches ,max_patches )
for _ in range (num_layers )
])
self .final_norm =nn .LayerNorm (hidden_size )
self .unpatch_embed =UnpatchEmbed (patch_size ,out_channels ,hidden_size )
self .gradient_checkpointing =False
self ._init_weights ()
def _init_weights (self ):
nn .init .zeros_ (self .unpatch_embed .proj .weight )
nn .init .zeros_ (self .unpatch_embed .proj .bias )
def enable_gradient_checkpointing (self ):
"""Enable gradient checkpointing for memory efficiency."""
self .gradient_checkpointing =True
def forward (self ,x :torch .Tensor ,timesteps :torch .Tensor ,context :torch .Tensor ,mask :Optional [torch .Tensor ]=None )->torch .Tensor :
batch_size ,channels ,height ,width =x .shape
patch_height =height //self .patch_size
patch_width =width //self .patch_size
half_dim =self .hidden_size //2
t_emb =math .log (10000 )/(half_dim -1 )
t_emb =torch .exp (torch .arange (half_dim ,device =x .device ,dtype =x .dtype )*-t_emb )
t_emb =timesteps [:,None ].to (x .dtype )*t_emb [None ,:]
t_emb =torch .cat ([torch .sin (t_emb ),torch .cos (t_emb )],dim =-1 )
t_emb =self .time_embed (t_emb )
x_patches =self .patch_embed (x )
context_proj =self .context_proj (context )
x_a =x_patches
x_b =x_patches .clone ()
for block in self .blocks :
if self .gradient_checkpointing and self .training :
x_a ,x_b =torch .utils .checkpoint .checkpoint (
block ,x_a ,x_b ,context_proj ,t_emb ,patch_height ,patch_width ,
use_reentrant =False
)
else :
x_a ,x_b =block (x_a ,x_b ,context_proj ,t_emb ,patch_height ,patch_width )
x_combined =(x_a +x_b )/2
x_combined =self .final_norm (x_combined )
velocity =self .unpatch_embed (x_combined ,patch_height ,patch_width )
return velocity
class ImageVAE (nn .Module ):
"""Lightweight VAE for image encoding/decoding."""
def __init__ (self ,in_channels :int =3 ,latent_channels :int =4 ,base_channels :int =64 ):
super ().__init__ ()
self .encoder =nn .Sequential (
nn .Conv2d (in_channels ,base_channels ,3 ,padding =1 ),
nn .SiLU (),
nn .Conv2d (base_channels ,base_channels *2 ,3 ,stride =2 ,padding =1 ),
nn .SiLU (),
nn .Conv2d (base_channels *2 ,base_channels *4 ,3 ,stride =2 ,padding =1 ),
nn .SiLU (),
nn .Conv2d (base_channels *4 ,latent_channels *2 ,3 ,padding =1 ),
)
self .decoder =nn .Sequential (
nn .Conv2d (latent_channels ,base_channels *4 ,3 ,padding =1 ),
nn .SiLU (),
nn .Upsample (scale_factor =2 ,mode ='bilinear',align_corners =False ),
nn .Conv2d (base_channels *4 ,base_channels *2 ,3 ,padding =1 ),
nn .SiLU (),
nn .Upsample (scale_factor =2 ,mode ='bilinear',align_corners =False ),
nn .Conv2d (base_channels *2 ,base_channels ,3 ,padding =1 ),
nn .SiLU (),
nn .Conv2d (base_channels ,in_channels ,3 ,padding =1 ),
)
def encode (self ,x :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ,torch .Tensor ]:
h =self .encoder (x )
mean ,logvar =h .chunk (2 ,dim =1 )
logvar =torch .clamp (logvar ,-30 ,20 )
std =torch .exp (0.5 *logvar )
z =mean +std *torch .randn_like (std )
return z ,mean ,logvar
def decode (self ,z :torch .Tensor )->torch .Tensor :
return self .decoder (z )
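# Illustrative round-trip sketch for ImageVAE (random image batch; the encoder's two
# stride-2 convolutions give a 4x spatial compression into the latent space):
#
#   vae = ImageVAE(in_channels=3, latent_channels=4, base_channels=64)
#   imgs = torch.randn(2, 3, 256, 256)
#   z, mean, logvar = vae.encode(imgs)                   # z: [2, 4, 64, 64]
#   recon = vae.decode(z)                                 # [2, 3, 256, 256]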
class MobileDiffusionGenerator (nn .Module ):
"""
SOTA Image Diffusion with MoE-DiT, Flow Matching, 2D-RoPE, Dual-Stream.
Optimized for 2x T4 GPUs with FP16.
"""
def __init__ (
self ,
latent_channels :int =4 ,
base_channels :int =128 ,
context_dim :int =1024 ,
num_inference_steps :int =50 ,
image_size :int =256 ,
cfg_scale :float =7.5 ,
):
super ().__init__ ()
self .latent_channels =latent_channels
self .context_dim =context_dim
self .image_size =image_size
self .latent_size =image_size //4
self .num_inference_steps =num_inference_steps
self .cfg_scale =cfg_scale
self .vae_encoder =ImageVAE (3 ,latent_channels ,base_channels //2 )
self .vae_decoder =self .vae_encoder
self .unet =MoEDiT (
in_channels =latent_channels ,
out_channels =latent_channels ,
hidden_size =base_channels *4 ,
context_dim =context_dim ,
num_layers =8 ,
num_heads =8 ,
num_experts =4 ,
patch_size =2 ,
max_image_size =self .latent_size ,
)
self .scheduler =FlowMatchingScheduler (num_inference_steps )
def encode (self ,x :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ,torch .Tensor ]:
return self .vae_encoder .encode (x )
def decode (self ,z :torch .Tensor )->torch .Tensor :
return self .vae_decoder .decode (z )
def training_step (self ,images :torch .Tensor ,context :torch .Tensor ,mask :Optional [torch .Tensor ]=None )->dict :
device =images .device
dtype =images .dtype
batch_size =images .shape [0 ]
z ,mean ,logvar =self .encode (images *2 -1 )
del images
t =torch .rand (batch_size ,device =device ,dtype =dtype )
x_t =self .scheduler .add_noise (z ,t )
target_velocity =self .scheduler .get_velocity (x_t ,z ,t )
if self .training :
drop_mask =torch .rand (batch_size ,device =device )<0.1
drop_mask_expanded =drop_mask .view (batch_size ,1 ,1 ).expand_as (context )
null_ctx =torch .zeros_like (context )
context =torch .where (drop_mask_expanded ,null_ctx ,context )
del drop_mask ,drop_mask_expanded ,null_ctx
pred_velocity =self .unet (x_t ,(t *1000 ).to (dtype ),context ,mask )
del x_t ,context
flow_loss =F .mse_loss (pred_velocity ,target_velocity )
del pred_velocity ,target_velocity
kl_loss =-0.5 *torch .mean (1 +logvar -mean .pow (2 )-logvar .exp ())
del z ,mean ,logvar
total_loss =flow_loss +0.0001 *kl_loss
return {
'flow_loss':flow_loss ,
'kl_loss':kl_loss ,
'total_loss':total_loss ,
}
@torch .no_grad ()
def generate (self ,context :torch .Tensor ,guidance_scale :float =None ,num_steps :int =None ,init_latents :Optional [torch .Tensor ]=None ,mask :Optional [torch .Tensor ]=None ,masked_image_latents :Optional [torch .Tensor ]=None )->torch .Tensor :
device =context .device
batch_size =context .shape [0 ]
seq_len =context .shape [1 ]
guidance_scale =guidance_scale or self .cfg_scale
num_steps =num_steps or self .num_inference_steps
if init_latents is not None :
latents =init_latents
else :
latents =torch .randn (batch_size ,self .latent_channels ,self .latent_size ,self .latent_size ,device =device )
timesteps =torch .linspace (1 ,0 ,num_steps +1 ,device =device )
if guidance_scale >1.0 :
null_ctx =torch .zeros (batch_size ,seq_len ,self .context_dim ,device =device ,dtype =context .dtype )
context =torch .cat ([null_ctx ,context ])
for i in range (num_steps ):
t =timesteps [i ]
t_prev =timesteps [i +1 ]
t_batch =t .expand (batch_size )*1000
if guidance_scale >1.0 :
latent_input =torch .cat ([latents ,latents ])
t_input =torch .cat ([t_batch ,t_batch ])
velocity_pred =self .unet (latent_input ,t_input ,context ,mask )
velocity_uncond ,velocity_cond =velocity_pred .chunk (2 )
velocity_pred =velocity_uncond +guidance_scale *(velocity_cond -velocity_uncond )
else :
velocity_pred =self .unet (latents ,t_batch ,context ,mask )
latents =self .scheduler .step (velocity_pred ,t ,t_prev ,latents )
if mask is not None and masked_image_latents is not None :
latents =masked_image_latents *mask +latents *(1 -mask )
images =self .decode (latents )
images =(images +1 )/2
return torch .clamp (images ,0 ,1 )
@torch .no_grad ()
def edit_image (self ,image :torch .Tensor ,context :torch .Tensor ,mask :torch .Tensor ,strength :float =0.8 ,guidance_scale :float =None )->torch .Tensor :
device =image .device
image_norm =image *2 -1
z ,_ ,_ =self .encode (image_norm )
mask_latent =F .interpolate (mask ,size =(self .latent_size ,self .latent_size ),mode ='nearest')
num_steps =int (self .num_inference_steps *strength )
t =torch .tensor ([strength ],device =device )
noisy_z =self .scheduler .add_noise (z ,t .expand (z .shape [0 ]))
return self .generate (
context ,
guidance_scale =guidance_scale ,
num_steps =num_steps ,
init_latents =noisy_z ,
mask =mask_latent ,
masked_image_latents =z ,
)
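# Illustrative text-to-image sketch for MobileDiffusionGenerator (random conditioning; the
# 77-token context length is only an example, any context length works):
#
#   gen = MobileDiffusionGenerator(context_dim=1024, image_size=256, num_inference_steps=50)
#   text_ctx = torch.randn(1, 77, 1024)                  # [B, ctx_len, context_dim]
#   images = gen.generate(text_ctx, guidance_scale=7.5)  # [1, 3, 256, 256], values in [0, 1]
#   # training instead computes a flow-matching + KL objective:
#   # losses = gen.training_step(torch.rand(1, 3, 256, 256), text_ctx)
#   # losses["total_loss"].backward()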
# ==============================================================================
# MODELS.GENERATORS.VIDEO
# ==============================================================================
EPS =1e-5
class InterleavedMRoPE (nn .Module ):
"""
Interleaved Multi-dimensional Rotary Position Embedding (MRoPE).
SOTA: allocates rotary frequencies jointly over time, height, and width for robust 3D positional encoding.
Unlike separate spatial and temporal RoPE, Interleaved-MRoPE allocates frequencies across
all three dimensions jointly, enhancing long-horizon video reasoning.
Key advantages:
- Better temporal-spatial correlation modeling
- More robust for variable aspect ratios and frame counts
- Improved long-range video understanding
"""
def __init__ (self ,dim :int ,max_height :int =64 ,max_width :int =64 ,max_frames :int =64 ,base :float =10000.0 ):
super ().__init__ ()
self .dim =dim
self .max_height =max_height
self .max_width =max_width
self .max_frames =max_frames
self .base =base
self .dim_t =dim //3
self .dim_y =dim //3
self .dim_x =dim -self .dim_t -self .dim_y
inv_freq_t =1.0 /(base **(torch .arange (0 ,self .dim_t ,2 ,dtype =torch .float32 )/self .dim_t ))
inv_freq_y =1.0 /(base **(torch .arange (0 ,self .dim_y ,2 ,dtype =torch .float32 )/self .dim_y ))
inv_freq_x =1.0 /(base **(torch .arange (0 ,self .dim_x ,2 ,dtype =torch .float32 )/self .dim_x ))
self .register_buffer ('inv_freq_t',inv_freq_t ,persistent =False )
self .register_buffer ('inv_freq_y',inv_freq_y ,persistent =False )
self .register_buffer ('inv_freq_x',inv_freq_x ,persistent =False )
def forward (self ,x :torch .Tensor ,height :int ,width :int ,num_frames :int )->Tuple [torch .Tensor ,torch .Tensor ]:
"""
Compute interleaved 3D positional embeddings.
Args:
x: Input tensor for device/dtype reference
height: Spatial height
width: Spatial width
num_frames: Temporal frames
Returns:
cos, sin: [T * H * W, dim] positional embeddings
"""
device =x .device
dtype =x .dtype
pos_t =torch .arange (num_frames ,device =device ,dtype =torch .float32 )
pos_y =torch .arange (height ,device =device ,dtype =torch .float32 )
pos_x =torch .arange (width ,device =device ,dtype =torch .float32 )
freqs_t =torch .outer (pos_t ,self .inv_freq_t .to (device ))
freqs_y =torch .outer (pos_y ,self .inv_freq_y .to (device ))
freqs_x =torch .outer (pos_x ,self .inv_freq_x .to (device ))
freqs_t =torch .cat ([freqs_t ,freqs_t ],dim =-1 )
freqs_y =torch .cat ([freqs_y ,freqs_y ],dim =-1 )
freqs_x =torch .cat ([freqs_x ,freqs_x ],dim =-1 )
seq_len =num_frames *height *width
cos_3d =torch .zeros (num_frames ,height ,width ,self .dim ,device =device ,dtype =dtype )
sin_3d =torch .zeros (num_frames ,height ,width ,self .dim ,device =device ,dtype =dtype )
for t in range (num_frames ):
for h in range (height ):
for w in range (width ):
cos_3d [t ,h ,w ,:self .dim_t ]=freqs_t [t ].cos ().to (dtype )
sin_3d [t ,h ,w ,:self .dim_t ]=freqs_t [t ].sin ().to (dtype )
cos_3d [t ,h ,w ,self .dim_t :self .dim_t +self .dim_y ]=freqs_y [h ].cos ().to (dtype )
sin_3d [t ,h ,w ,self .dim_t :self .dim_t +self .dim_y ]=freqs_y [h ].sin ().to (dtype )
cos_3d [t ,h ,w ,self .dim_t +self .dim_y :]=freqs_x [w ].cos ().to (dtype )
sin_3d [t ,h ,w ,self .dim_t +self .dim_y :]=freqs_x [w ].sin ().to (dtype )
cos_3d =cos_3d .view (seq_len ,self .dim )
sin_3d =sin_3d .view (seq_len ,self .dim )
return cos_3d ,sin_3d
class RoPE2D (nn .Module ):
"""
2D Rotary Position Embedding for spatial dimensions (memory efficient).
Used for spatial attention in factorized video attention.
"""
def __init__ (self ,dim :int ,max_height :int =64 ,max_width :int =64 ,base :float =10000.0 ):
super ().__init__ ()
self .dim =dim
self .dim_x =dim //2
self .dim_y =dim -self .dim_x
inv_freq_x =1.0 /(base **(torch .arange (0 ,self .dim_x ,2 ,dtype =torch .float32 )/self .dim_x ))
inv_freq_y =1.0 /(base **(torch .arange (0 ,self .dim_y ,2 ,dtype =torch .float32 )/self .dim_y ))
self .register_buffer ('inv_freq_x',inv_freq_x ,persistent =False )
self .register_buffer ('inv_freq_y',inv_freq_y ,persistent =False )
def forward (self ,x :torch .Tensor ,height :int ,width :int )->Tuple [torch .Tensor ,torch .Tensor ]:
device =x .device
dtype =x .dtype
pos_x =torch .arange (width ,device =device ,dtype =torch .float32 )
pos_y =torch .arange (height ,device =device ,dtype =torch .float32 )
freqs_x =torch .outer (pos_x ,self .inv_freq_x .to (device ))
freqs_y =torch .outer (pos_y ,self .inv_freq_y .to (device ))
cos_x =torch .cat ([freqs_x .cos (),freqs_x .cos ()],dim =-1 )
sin_x =torch .cat ([freqs_x .sin (),freqs_x .sin ()],dim =-1 )
cos_y =torch .cat ([freqs_y .cos (),freqs_y .cos ()],dim =-1 )
sin_y =torch .cat ([freqs_y .sin (),freqs_y .sin ()],dim =-1 )
cos_2d =torch .zeros (height ,width ,self .dim ,device =device ,dtype =dtype )
sin_2d =torch .zeros (height ,width ,self .dim ,device =device ,dtype =dtype )
cos_2d [:,:,:self .dim_x ]=cos_x .unsqueeze (0 ).expand (height ,-1 ,-1 )
sin_2d [:,:,:self .dim_x ]=sin_x .unsqueeze (0 ).expand (height ,-1 ,-1 )
cos_2d [:,:,self .dim_x :]=cos_y .unsqueeze (1 ).expand (-1 ,width ,-1 )
sin_2d [:,:,self .dim_x :]=sin_y .unsqueeze (1 ).expand (-1 ,width ,-1 )
return cos_2d .view (height *width ,self .dim ).to (dtype ),sin_2d .view (height *width ,self .dim ).to (dtype )
class RoPE1D (nn .Module ):
"""
1D Rotary Position Embedding for temporal dimension.
Used for temporal attention in factorized video attention.
"""
def __init__ (self ,dim :int ,max_len :int =64 ,base :float =10000.0 ):
super ().__init__ ()
self .dim =dim
inv_freq =1.0 /(base **(torch .arange (0 ,dim ,2 ,dtype =torch .float32 )/dim ))
self .register_buffer ('inv_freq',inv_freq ,persistent =False )
def forward (self ,x :torch .Tensor ,seq_len :int )->Tuple [torch .Tensor ,torch .Tensor ]:
device =x .device
dtype =x .dtype
pos =torch .arange (seq_len ,device =device ,dtype =torch .float32 )
freqs =torch .outer (pos ,self .inv_freq .to (device ))
freqs =torch .cat ([freqs ,freqs ],dim =-1 )
return freqs .cos ().to (dtype ),freqs .sin ().to (dtype )
def apply_rope (x :torch .Tensor ,cos :torch .Tensor ,sin :torch .Tensor )->torch .Tensor :
"""Apply rotary position embedding."""
x1 =x [...,:x .shape [-1 ]//2 ]
x2 =x [...,x .shape [-1 ]//2 :]
rotated =torch .cat ((-x2 ,x1 ),dim =-1 )
return x *cos +rotated *sin
class TemporalExpertRouter (nn .Module ):
"""
Temporal-Aware Expert Router for video generation.
Routes tokens based on temporal context and motion patterns.
"""
def __init__ (self ,hidden_size :int ,num_experts :int =4 ,top_k :int =2 ):
super ().__init__ ()
self .num_experts =num_experts
self .top_k =top_k
self .temporal_proj =nn .Linear (hidden_size ,hidden_size )
self .gate =nn .Linear (hidden_size ,num_experts ,bias =False )
nn .init .normal_ (self .gate .weight ,mean =0.0 ,std =0.01 )
def forward (self ,x :torch .Tensor ,temporal_context :Optional [torch .Tensor ]=None )->Tuple [torch .Tensor ,torch .Tensor ]:
if temporal_context is not None :
x =x +self .temporal_proj (temporal_context )
router_logits =self .gate (x )
router_probs =F .softmax (router_logits ,dim =-1 ,dtype =x .dtype )
top_k_probs ,top_k_indices =torch .topk (router_probs ,self .top_k ,dim =-1 )
top_k_probs =top_k_probs /(top_k_probs .sum (dim =-1 ,keepdim =True )+EPS )
return top_k_probs ,top_k_indices
class VideoExpert (nn .Module ):
"""Single expert for video processing with SwiGLU."""
def __init__ (self ,hidden_size :int ,intermediate_size :int ):
super ().__init__ ()
self .gate_proj =nn .Linear (hidden_size ,intermediate_size ,bias =False )
self .up_proj =nn .Linear (hidden_size ,intermediate_size ,bias =False )
self .down_proj =nn .Linear (intermediate_size ,hidden_size ,bias =False )
self .act_fn =nn .SiLU ()
def forward (self ,x :torch .Tensor )->torch .Tensor :
return self .down_proj (self .act_fn (self .gate_proj (x ))*self .up_proj (x ))
class TemporalMoELayer (nn .Module ):
"""
Temporal-Aware MoE Layer for video generation.
Uses motion-aware routing for expert selection.
"""
def __init__ (self ,hidden_size :int ,intermediate_size :int ,num_experts :int =4 ,top_k :int =2 ):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_experts =num_experts
self .top_k =top_k
self .router =TemporalExpertRouter (hidden_size ,num_experts ,top_k )
self .experts =nn .ModuleList ([
VideoExpert (hidden_size ,intermediate_size )
for _ in range (num_experts )
])
self .shared_expert =VideoExpert (hidden_size ,intermediate_size )
def forward (self ,x :torch .Tensor ,temporal_context :Optional [torch .Tensor ]=None )->torch .Tensor :
batch_size ,seq_len ,hidden_size =x .shape
x_flat =x .view (-1 ,hidden_size )
top_k_probs ,top_k_indices =self .router (x_flat ,temporal_context .view (-1 ,hidden_size )if temporal_context is not None else None )
output =torch .zeros_like (x_flat )
for expert_idx in range (self .num_experts ):
expert =self .experts [expert_idx ]
for k in range (self .top_k ):
mask =(top_k_indices [:,k ]==expert_idx )
if mask .any ():
expert_input =x_flat [mask ]
expert_output =expert (expert_input )
weight =top_k_probs [mask ,k :k +1 ]
output [mask ]=output [mask ]+weight *expert_output
shared_output =self .shared_expert (x_flat )
output =output +shared_output
return output .view (batch_size ,seq_len ,hidden_size )
class SpatialAttention (nn .Module ):
"""
Spatial self-attention: each frame attends only within itself.
Memory: O(T * (H*W)^2) instead of O((T*H*W)^2)
"""
def __init__ (self ,hidden_size :int ,num_heads :int =8 ,max_height :int =64 ,max_width :int =64 ):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_heads =num_heads
self .head_dim =hidden_size //num_heads
self .scale =self .head_dim **-0.5
self .to_qkv =nn .Linear (hidden_size ,hidden_size *3 ,bias =False )
self .to_out =nn .Linear (hidden_size ,hidden_size ,bias =False )
self .rope_2d =RoPE2D (self .head_dim ,max_height ,max_width )
self .norm =nn .LayerNorm (hidden_size )
def forward (self ,x :torch .Tensor ,height :int ,width :int ,frames :int )->torch .Tensor :
batch_size ,seq_len ,_ =x .shape
spatial_len =height *width
x =self .norm (x )
x =x .view (batch_size *frames ,spatial_len ,self .hidden_size )
qkv =self .to_qkv (x ).reshape (batch_size *frames ,spatial_len ,3 ,self .num_heads ,self .head_dim )
q ,k ,v =qkv .unbind (dim =2 )
cos ,sin =self .rope_2d (x ,height ,width )
cos =cos .unsqueeze (0 ).unsqueeze (1 )
sin =sin .unsqueeze (0 ).unsqueeze (1 )
q =q .transpose (1 ,2 )
k =k .transpose (1 ,2 )
v =v .transpose (1 ,2 )
q =apply_rope (q ,cos ,sin )
k =apply_rope (k ,cos ,sin )
qk_scale =self .head_dim **-0.25
out =F .scaled_dot_product_attention (
q *qk_scale ,k *qk_scale ,v ,
is_causal =False ,scale =1.0 ,
)
out =out .transpose (1 ,2 ).reshape (batch_size *frames ,spatial_len ,self .hidden_size )
out =self .to_out (out )
return out .view (batch_size ,seq_len ,self .hidden_size )
class TemporalAttention (nn .Module ):
"""
Temporal self-attention: each spatial position attends across time.
Memory: O(H*W * T^2) instead of O((T*H*W)^2)
"""
def __init__ (self ,hidden_size :int ,num_heads :int =8 ,max_frames :int =32 ):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_heads =num_heads
self .head_dim =hidden_size //num_heads
self .scale =self .head_dim **-0.5
self .to_qkv =nn .Linear (hidden_size ,hidden_size *3 ,bias =False )
self .to_out =nn .Linear (hidden_size ,hidden_size ,bias =False )
self .rope_1d =RoPE1D (self .head_dim ,max_frames )
self .norm =nn .LayerNorm (hidden_size )
def forward (self ,x :torch .Tensor ,height :int ,width :int ,frames :int ,causal :bool =True )->torch .Tensor :
batch_size ,seq_len ,_ =x .shape
spatial_len =height *width
x =self .norm (x )
x =x .view (batch_size ,frames ,spatial_len ,self .hidden_size )
x =x .permute (0 ,2 ,1 ,3 ).reshape (batch_size *spatial_len ,frames ,self .hidden_size )
qkv =self .to_qkv (x ).reshape (batch_size *spatial_len ,frames ,3 ,self .num_heads ,self .head_dim )
q ,k ,v =qkv .unbind (dim =2 )
cos ,sin =self .rope_1d (x ,frames )
cos =cos .unsqueeze (0 ).unsqueeze (1 )
sin =sin .unsqueeze (0 ).unsqueeze (1 )
q =q .transpose (1 ,2 )
k =k .transpose (1 ,2 )
v =v .transpose (1 ,2 )
q =apply_rope (q ,cos ,sin )
k =apply_rope (k ,cos ,sin )
qk_scale =self .head_dim **-0.25
out =F .scaled_dot_product_attention (
q *qk_scale ,k *qk_scale ,v ,
is_causal =causal ,scale =1.0 ,
)
out =out .transpose (1 ,2 ).reshape (batch_size *spatial_len ,frames ,self .hidden_size )
out =out .view (batch_size ,spatial_len ,frames ,self .hidden_size )
out =out .permute (0 ,2 ,1 ,3 ).reshape (batch_size ,seq_len ,self .hidden_size )
out =self .to_out (out )
return out
class FactorizedSpatioTemporalAttention (nn .Module ):
"""
Factorized Spatial-Temporal Attention (like CogVideo, Open-Sora, SVD).
Instead of full 3D attention O((T*H*W)^2), uses:
1. Spatial attention per frame: O(T * (H*W)^2)
2. Temporal attention per position: O(H*W * T^2)
Total: O(T*(H*W)^2 + H*W*T^2) << O((T*H*W)^2)
For T=8, H=W=64:
- Full 3D: 32768^2 ≈ 1.07B attention scores
- Factorized: 8*4096^2 + 4096*8^2 ≈ 134.5M attention scores (~8x fewer)
"""
def __init__ (self ,hidden_size :int ,num_heads :int =8 ,max_frames :int =32 ,max_height :int =64 ,max_width :int =64 ):
super ().__init__ ()
self .spatial_attn =SpatialAttention (hidden_size ,num_heads ,max_height ,max_width )
self .temporal_attn =TemporalAttention (hidden_size ,num_heads ,max_frames )
def forward (self ,x :torch .Tensor ,height :int ,width :int ,frames :int ,causal :bool =True )->torch .Tensor :
x =x +self .spatial_attn (x ,height ,width ,frames )
x =x +self .temporal_attn (x ,height ,width ,frames ,causal )
return x
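# Illustrative usage sketch for FactorizedSpatioTemporalAttention (random tokens; the video
# is flattened to [B, T*H*W, hidden] and frame/height/width are passed explicitly):
#
#   attn = FactorizedSpatioTemporalAttention(hidden_size=256, num_heads=8, max_frames=8)
#   frames, height, width = 4, 16, 16
#   x = torch.randn(1, frames * height * width, 256)
#   y = attn(x, height=height, width=width, frames=frames, causal=True)   # same shape as x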
class CrossAttention3D (nn .Module ):
"""Cross-attention for text-to-video conditioning."""
def __init__ (self ,query_dim :int ,context_dim :int =None ,heads :int =8 ):
super ().__init__ ()
self .heads =heads
context_dim =context_dim or query_dim
self .head_dim =query_dim //heads
self .scale =self .head_dim **-0.5
self .norm =nn .LayerNorm (query_dim )
self .to_q =nn .Linear (query_dim ,query_dim ,bias =False )
self .to_k =nn .Linear (context_dim ,query_dim ,bias =False )
self .to_v =nn .Linear (context_dim ,query_dim ,bias =False )
self .to_out =nn .Linear (query_dim ,query_dim ,bias =False )
def forward (self ,x :torch .Tensor ,context :torch .Tensor )->torch .Tensor :
batch_size ,seq_len ,_ =x .shape
ctx_len =context .shape [1 ]
x =self .norm (x )
q =self .to_q (x ).reshape (batch_size ,seq_len ,self .heads ,self .head_dim ).transpose (1 ,2 )
k =self .to_k (context ).reshape (batch_size ,ctx_len ,self .heads ,self .head_dim ).transpose (1 ,2 )
v =self .to_v (context ).reshape (batch_size ,ctx_len ,self .heads ,self .head_dim ).transpose (1 ,2 )
qk_scale =self .head_dim **-0.25
out =F .scaled_dot_product_attention (
q *qk_scale ,k *qk_scale ,v ,
is_causal =False ,scale =1.0 ,
)
out =out .transpose (1 ,2 ).reshape (batch_size ,seq_len ,-1 )
out =self .to_out (out )
return out
class Causal3DTransformerBlock (nn .Module ):
"""
3D Causal Transformer Block with Factorized Spatial-Temporal Attention.
Uses memory-efficient factorized attention instead of full 3D attention:
- Spatial: Each frame attends within itself O(T * (H*W)^2)
- Temporal: Each position attends across frames O(H*W * T^2)
This reduces memory from O((T*H*W)^2) to O(T*(H*W)^2 + H*W*T^2)
"""
def __init__ (self ,hidden_size :int ,context_dim :int ,num_heads :int =8 ,num_experts :int =4 ,max_frames :int =32 ,max_height :int =64 ,max_width :int =64 ):
super ().__init__ ()
self .self_attn =FactorizedSpatioTemporalAttention (hidden_size ,num_heads ,max_frames ,max_height ,max_width )
self .cross_attn =CrossAttention3D (hidden_size ,context_dim ,num_heads )
self .moe =TemporalMoELayer (hidden_size ,hidden_size *4 ,num_experts )
self .norm1 =nn .LayerNorm (hidden_size )
self .norm2 =nn .LayerNorm (hidden_size )
self .norm3 =nn .LayerNorm (hidden_size )
def forward (self ,x :torch .Tensor ,context :torch .Tensor ,height :int ,width :int ,frames :int ,temporal_context :Optional [torch .Tensor ]=None )->torch .Tensor :
x =self .self_attn (self .norm1 (x ),height ,width ,frames ,causal =True )
x =x +self .cross_attn (self .norm2 (x ),context )
x =x +self .moe (self .norm3 (x ),temporal_context )
return x
class FlowMatchingScheduler :
"""
Flow Matching scheduler for video generation.
Uses optimal transport paths for superior generation quality.
"""
def __init__ (self ,num_steps :int =50 ,sigma_min :float =0.002 ):
self .num_steps =num_steps
self .sigma_min =sigma_min
self .timesteps =torch .linspace (1 ,0 ,num_steps +1 )
def get_velocity (self ,x_t :torch .Tensor ,x_0 :torch .Tensor ,t :torch .Tensor )->torch .Tensor :
"""Compute target velocity for flow matching."""
return x_0 -x_t
def step (self ,model_output :torch .Tensor ,t :torch .Tensor ,t_prev :torch .Tensor ,x_t :torch .Tensor )->torch .Tensor :
"""Single step of flow matching ODE."""
dt =t -t_prev
x_prev =x_t +model_output *dt .view (-1 ,1 ,1 ,1 ,1 )
return x_prev
def add_noise (self ,x_0 :torch .Tensor ,t :torch .Tensor )->torch .Tensor :
"""Add noise for training (linear interpolation)."""
noise =torch .randn_like (x_0 )
t =t .to (x_0 .dtype ).view (-1 ,1 ,1 ,1 ,1 )
x_t =t *noise +(1 -t )*x_0
return x_t
class VideoUNet3D (nn .Module ):
"""
3D U-Net for video generation with Factorized Spatial-Temporal Attention.
Uses memory-efficient factorized attention that processes spatial and temporal
dimensions separately, reducing memory from O((T*H*W)^2) to O(T*(H*W)^2 + H*W*T^2).
"""
def __init__ (
self ,
in_channels :int =4 ,
out_channels :int =4 ,
hidden_size :int =512 ,
context_dim :int =1024 ,
num_layers :int =4 ,
num_heads :int =8 ,
num_experts :int =4 ,
num_frames :int =16 ,
max_height :int =64 ,
max_width :int =64 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_frames =num_frames
self .time_embed =nn .Sequential (
nn .Linear (hidden_size ,hidden_size *4 ),
nn .SiLU (),
nn .Linear (hidden_size *4 ,hidden_size ),
)
self .input_proj =nn .Conv3d (in_channels ,hidden_size ,kernel_size =3 ,padding =1 )
self .transformer_blocks =nn .ModuleList ([
Causal3DTransformerBlock (hidden_size ,context_dim ,num_heads ,num_experts ,num_frames ,max_height ,max_width )
for _ in range (num_layers )
])
self .output_proj =nn .Sequential (
nn .GroupNorm (32 ,hidden_size ),
nn .SiLU (),
nn .Conv3d (hidden_size ,out_channels ,kernel_size =3 ,padding =1 ),
)
nn .init .zeros_ (self .output_proj [-1 ].weight )
nn .init .zeros_ (self .output_proj [-1 ].bias )
self .gradient_checkpointing =False
def enable_gradient_checkpointing (self ):
"""Enable gradient checkpointing for memory efficiency."""
self .gradient_checkpointing =True
def forward (self ,x :torch .Tensor ,timesteps :torch .Tensor ,context :torch .Tensor ,first_frame_latent :Optional [torch .Tensor ]=None )->torch .Tensor :
batch_size ,channels ,frames ,height ,width =x .shape
half_dim =self .hidden_size //2
t_emb =math .log (10000 )/(half_dim -1 )
t_emb =torch .exp (torch .arange (half_dim ,device =x .device ,dtype =x .dtype )*-t_emb )
t_emb =timesteps [:,None ].to (x .dtype )*t_emb [None ,:]
t_emb =torch .cat ([torch .sin (t_emb ),torch .cos (t_emb )],dim =-1 )
t_emb =self .time_embed (t_emb )
h =self .input_proj (x )
h =h .permute (0 ,2 ,3 ,4 ,1 ).reshape (batch_size ,frames *height *width ,self .hidden_size )
temporal_context =t_emb .unsqueeze (1 ).expand (-1 ,frames *height *width ,-1 )
for block in self .transformer_blocks :
if self .gradient_checkpointing and self .training :
h =torch .utils .checkpoint .checkpoint (
block ,h ,context ,height ,width ,frames ,temporal_context ,
use_reentrant =False
)
else :
h =block (h ,context ,height ,width ,frames ,temporal_context )
h =h .reshape (batch_size ,frames ,height ,width ,self .hidden_size ).permute (0 ,4 ,1 ,2 ,3 )
velocity =self .output_proj (h )
return velocity
class VideoVAE3D (nn .Module ):
"""
3D VAE for video encoding/decoding using VidTok architecture.
This replaces the simple placeholder with proper temporal+spatial compression
following Microsoft's VidTok architecture for high-quality video tokenization.
Features:
- Proper temporal compression (4x default)
- Proper spatial compression (8x default, same as image VAE)
- AlphaBlender for temporal blending
- Causal mode support for streaming
- Both KL (continuous) and FSQ (discrete) tokenization
Compression: [B, C, T, H, W] -> [B, latent_ch, T/4, H/8, W/8]
"""
def __init__ (
self ,
in_channels :int =3 ,
latent_channels :int =4 ,
base_channels :int =64 ,
temporal_compression :int =4 ,
spatial_compression :int =8 ,
causal :bool =True ,
use_fsq :bool =False ,
):
super ().__init__ ()
self .in_channels =in_channels
self .latent_channels =latent_channels
self .temporal_compression =temporal_compression
self .spatial_compression =spatial_compression
self .causal =causal
self .use_fsq =use_fsq
self .temporal_stages =int (math .log2 (temporal_compression ))
self .spatial_stages =int (math .log2 (spatial_compression ))
encoder_layers =[]
ch_in =in_channels
ch_out =base_channels
encoder_layers .append (nn .Conv3d (ch_in ,ch_out ,3 ,padding =1 ))
encoder_layers .append (nn .SiLU ())
for i in range (self .spatial_stages -self .temporal_stages ):
ch_in =ch_out
ch_out =min (ch_out *2 ,base_channels *8 )
encoder_layers .append (nn .Conv3d (ch_in ,ch_out ,3 ,stride =(1 ,2 ,2 ),padding =1 ))
encoder_layers .append (nn .SiLU ())
for i in range (self .temporal_stages ):
ch_in =ch_out
ch_out =min (ch_out *2 ,base_channels *8 )
encoder_layers .append (nn .Conv3d (ch_in ,ch_out ,3 ,stride =(2 ,2 ,2 ),padding =1 ))
encoder_layers .append (nn .SiLU ())
out_ch =latent_channels *2 if not use_fsq else latent_channels
encoder_layers .append (nn .Conv3d (ch_out ,out_ch ,3 ,padding =1 ))
self .encoder =nn .Sequential (*encoder_layers )
decoder_layers =[]
ch_in =latent_channels
ch_out =base_channels *(2 **min (self .spatial_stages ,3 ))
decoder_layers .append (nn .Conv3d (ch_in ,ch_out ,3 ,padding =1 ))
decoder_layers .append (nn .SiLU ())
for i in range (self .temporal_stages ):
ch_in =ch_out
ch_out =max (ch_out //2 ,base_channels )
decoder_layers .append (nn .Upsample (scale_factor =(2 ,2 ,2 ),mode ='trilinear',align_corners =False ))
decoder_layers .append (nn .Conv3d (ch_in ,ch_out ,3 ,padding =1 ))
decoder_layers .append (nn .SiLU ())
for i in range (self .spatial_stages -self .temporal_stages ):
ch_in =ch_out
ch_out =max (ch_out //2 ,base_channels )
decoder_layers .append (nn .Upsample (scale_factor =(1 ,2 ,2 ),mode ='trilinear',align_corners =False ))
decoder_layers .append (nn .Conv3d (ch_in ,ch_out ,3 ,padding =1 ))
decoder_layers .append (nn .SiLU ())
decoder_layers .append (nn .Conv3d (ch_out ,in_channels ,3 ,padding =1 ))
self .decoder =nn .Sequential (*decoder_layers )
print (f" 🎬 VideoVAE3D (VidTok): {temporal_compression }x{spatial_compression }x{spatial_compression } compression")
print (f" Temporal stages: {self .temporal_stages }, Spatial stages: {self .spatial_stages }")
print (f" Mode: {'FSQ (discrete)'if use_fsq else 'KL (continuous)'}, Causal: {causal }")
def encode (self ,x :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ,torch .Tensor ]:
"""
Encode video to latent space.
Args:
x: [B, C, T, H, W] video tensor, values in [0, 1] or [-1, 1]
Returns:
Tuple of (z, mean, logvar) where z is the sampled latent
"""
h =self .encoder (x )
if self .use_fsq :
z =self ._fsq_quantize (h )
return z ,z ,torch .zeros_like (z )
else :
mean ,logvar =h .chunk (2 ,dim =1 )
logvar =torch .clamp (logvar ,-30 ,20 )
std =torch .exp (0.5 *logvar )
z =mean +std *torch .randn_like (std )
return z ,mean ,logvar
def _fsq_quantize (self ,z :torch .Tensor ,levels :int =8 )->torch .Tensor :
"""Finite Scalar Quantization."""
z =torch .tanh (z )
z =torch .round ((z +1 )*(levels -1 )/2 )*2 /(levels -1 )-1
return z
def decode (self ,z :torch .Tensor )->torch .Tensor :
"""
Decode latent to video.
Args:
z: [B, latent_ch, t, h, w] latent tensor
Returns:
[B, C, T, H, W] reconstructed video
"""
return self .decoder (z )
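# Usage sketch (illustrative only): round-tripping a short clip through VideoVAE3D.
# Shapes assume the default 4x temporal / 8x spatial compression; the tensors below are
# placeholders for the example, not part of the original module.
#
#   vae = VideoVAE3D(in_channels=3, latent_channels=4, base_channels=64)
#   video = torch.rand(1, 3, 16, 256, 256)           # [B, C, T, H, W] in [0, 1]
#   z, mean, logvar = vae.encode(video * 2 - 1)      # z: [1, 4, 4, 32, 32]
#   recon = vae.decode(z)                            # recon: [1, 3, 16, 256, 256]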
class MobileVideoDiffusion (nn .Module ):
"""
SOTA Video Diffusion with Flow Matching, Factorized Attention, Temporal MoE.
Uses memory-efficient factorized spatial-temporal attention:
- Full 3D attention: O((T*H*W)^2) = 1B+ attention scores (OOM!)
- Factorized: O(T*(H*W)^2 + H*W*T^2) = ~134M scores (7.5x less memory)
Optimized for 2x T4 GPUs (15GB each) with FP16.
"""
def __init__ (
self ,
latent_channels :int =4 ,
base_channels :int =64 ,
context_dim :int =1024 ,
num_frames :int =16 ,
image_size :int =256 ,
num_inference_steps :int =50 ,
cfg_scale :float =7.5 ,
temporal_compression :int =4 ,
spatial_compression :int =8 ,
causal :bool =True ,
use_fsq :bool =False ,
):
super ().__init__ ()
self .latent_channels =latent_channels
self .context_dim =context_dim
self .num_frames =num_frames
self .image_size =image_size
self .temporal_compression =temporal_compression
self .spatial_compression =spatial_compression
self .latent_size =image_size //spatial_compression
self .latent_frames =num_frames //temporal_compression
self .num_inference_steps =num_inference_steps
self .cfg_scale =cfg_scale
self .vae =VideoVAE3D (
in_channels =3 ,
latent_channels =latent_channels ,
base_channels =base_channels ,
temporal_compression =temporal_compression ,
spatial_compression =spatial_compression ,
causal =causal ,
use_fsq =use_fsq ,
)
self .unet =VideoUNet3D (
in_channels =latent_channels ,
out_channels =latent_channels ,
hidden_size =base_channels *4 ,
context_dim =context_dim ,
num_layers =4 ,
num_heads =8 ,
num_experts =4 ,
num_frames =num_frames ,
max_height =self .latent_size ,
max_width =self .latent_size ,
)
self .scheduler =FlowMatchingScheduler (num_inference_steps )
def encode_video (self ,video :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ,torch .Tensor ]:
return self .vae .encode (video *2 -1 )
def decode_video (self ,z :torch .Tensor )->torch .Tensor :
return self .vae .decode (z )
def encode_image (self ,image :torch .Tensor )->torch .Tensor :
image_expanded =image .unsqueeze (2 )
z ,_ ,_ =self .vae .encode (image_expanded )
return z .squeeze (2 )
def training_step (self ,video :torch .Tensor ,context :torch .Tensor ,first_frame :Optional [torch .Tensor ]=None )->dict :
device =video .device
dtype =video .dtype
batch_size =video .shape [0 ]
z ,mean ,logvar =self .encode_video (video )
del video
t =torch .rand (batch_size ,device =device ,dtype =dtype )
x_t =self .scheduler .add_noise (z ,t )
target_velocity =self .scheduler .get_velocity (x_t ,z ,t )
if self .training :
drop_mask =torch .rand (batch_size ,device =device )<0.1
drop_mask_expanded =drop_mask .view (batch_size ,1 ,1 ).expand_as (context )
null_ctx =torch .zeros_like (context )
context =torch .where (drop_mask_expanded ,null_ctx ,context )
del drop_mask ,drop_mask_expanded ,null_ctx
pred_velocity =self .unet (x_t ,(t *1000 ).to (dtype ),context ,None )
del x_t ,context
flow_loss =F .mse_loss (pred_velocity ,target_velocity )
del pred_velocity ,target_velocity
kl_loss =-0.5 *torch .mean (1 +logvar -mean .pow (2 )-logvar .exp ())
temporal_loss =torch .tensor (0.0 ,device =device ,dtype =dtype )
if z .shape [2 ]>1 :
z_diff =z [:,:,1 :]-z [:,:,:-1 ]
temporal_loss =torch .mean (z_diff **2 )
del z_diff
del z ,mean ,logvar
total_loss =flow_loss +0.0001 *kl_loss +0.01 *temporal_loss
return {
'flow_loss':flow_loss ,
'kl_loss':kl_loss ,
'temporal_loss':temporal_loss ,
'total_loss':total_loss ,
}
@torch .no_grad ()
def generate_t2v (self ,context :torch .Tensor ,num_frames :int =None ,guidance_scale :float =None ,num_steps :int =None )->torch .Tensor :
device =context .device
batch_size =context .shape [0 ]
seq_len =context .shape [1 ]
num_frames =num_frames or self .num_frames
guidance_scale =guidance_scale or self .cfg_scale
num_steps =num_steps or self .num_inference_steps
latents =torch .randn (
batch_size ,self .latent_channels ,num_frames ,
self .latent_size ,self .latent_size ,device =device
)
timesteps =torch .linspace (1 ,0 ,num_steps +1 ,device =device )
if guidance_scale >1.0 :
null_ctx =torch .zeros (batch_size ,seq_len ,self .context_dim ,device =device ,dtype =context .dtype )
context =torch .cat ([null_ctx ,context ])
for i in range (num_steps ):
t =timesteps [i ]
t_prev =timesteps [i +1 ]
t_batch =t .expand (batch_size )*1000
if guidance_scale >1.0 :
latent_input =torch .cat ([latents ,latents ])
t_input =torch .cat ([t_batch ,t_batch ])
velocity_pred =self .unet (latent_input ,t_input ,context ,None )
velocity_uncond ,velocity_cond =velocity_pred .chunk (2 )
velocity_pred =velocity_uncond +guidance_scale *(velocity_cond -velocity_uncond )
else :
velocity_pred =self .unet (latents ,t_batch ,context ,None )
latents =self .scheduler .step (velocity_pred ,t ,t_prev ,latents )
video =self .decode_video (latents )
return torch .clamp ((video +1 )/2 ,0 ,1 )
@torch .no_grad ()
def generate_i2v (self ,first_frame :torch .Tensor ,context :Optional [torch .Tensor ]=None ,num_frames :int =None ,guidance_scale :float =None ,num_steps :int =None )->torch .Tensor :
device =first_frame .device
batch_size =first_frame .shape [0 ]
num_frames =num_frames or self .num_frames
guidance_scale =guidance_scale or self .cfg_scale
num_steps =num_steps or self .num_inference_steps
first_frame_latent =self .encode_image (first_frame *2 -1 )
latents =torch .randn (
batch_size ,self .latent_channels ,num_frames ,
self .latent_size ,self .latent_size ,device =device
)
latents [:,:,0 ]=first_frame_latent
if context is None :
context =torch .zeros (batch_size ,77 ,self .context_dim ,device =device )
seq_len =context .shape [1 ]
timesteps =torch .linspace (1 ,0 ,num_steps +1 ,device =device )
if guidance_scale >1.0 :
null_ctx =torch .zeros (batch_size ,seq_len ,self .context_dim ,device =device ,dtype =context .dtype )
context =torch .cat ([null_ctx ,context ])
for i in range (num_steps ):
t =timesteps [i ]
t_prev =timesteps [i +1 ]
t_batch =t .expand (batch_size )*1000
if guidance_scale >1.0 :
latent_input =torch .cat ([latents ,latents ])
t_input =torch .cat ([t_batch ,t_batch ])
velocity_pred =self .unet (latent_input ,t_input ,context ,None )
velocity_uncond ,velocity_cond =velocity_pred .chunk (2 )
velocity_pred =velocity_uncond +guidance_scale *(velocity_cond -velocity_uncond )
else :
velocity_pred =self .unet (latents ,t_batch ,context ,None )
latents =self .scheduler .step (velocity_pred ,t ,t_prev ,latents )
latents [:,:,0 ]=first_frame_latent
video =self .decode_video (latents )
return torch .clamp ((video +1 )/2 ,0 ,1 )
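# Usage sketch (illustrative only): text-to-video sampling with MobileVideoDiffusion.
# The context tensor stands in for text-encoder states; its length (77) and the guidance
# settings are assumptions for the example. Note that num_frames here sets the latent
# temporal dimension fed to the UNet; the VAE decoder then upsamples it temporally.
#
#   model = MobileVideoDiffusion(latent_channels=4, base_channels=64, context_dim=1024)
#   text_ctx = torch.randn(1, 77, 1024)              # [B, seq, context_dim]
#   video = model.generate_t2v(text_ctx, num_frames=4, guidance_scale=7.5, num_steps=20)
#   # video: [B, 3, T, H, W] with values clamped to [0, 1]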
# ==============================================================================
# MODELS.LLM.MOE_LLAMA
# ==============================================================================
EPS =1e-5
class YaRNRotaryEmbedding (nn .Module ):
"""
YaRN (Yet another RoPE extensioN) with LongRoPE-style improvements.
Supports up to 128K+ context with proper frequency scaling.
"""
def __init__ (
self ,
dim :int ,
max_position_embeddings :int =131072 ,
base :float =500000.0 ,
original_max_position_embeddings :int =8192 ,
beta_fast :float =32.0 ,
beta_slow :float =1.0 ,
mscale :float =1.0 ,
):
super ().__init__ ()
self .dim =dim
self .max_position_embeddings =max_position_embeddings
self .base =base
self .original_max_position =original_max_position_embeddings
self .beta_fast =beta_fast
self .beta_slow =beta_slow
self .mscale =mscale
self .scaling_factor =max_position_embeddings /original_max_position_embeddings
inv_freq =self ._compute_yarn_inv_freq ()
self .register_buffer ('inv_freq',inv_freq ,persistent =False )
def _compute_yarn_inv_freq (self )->torch .Tensor :
"""Compute YaRN-scaled inverse frequencies."""
pos_freqs =self .base **(torch .arange (0 ,self .dim ,2 ,dtype =torch .float32 )/self .dim )
inv_freq_extrapolation =1.0 /pos_freqs
inv_freq_interpolation =1.0 /(self .scaling_factor *pos_freqs )
low =max (math .floor (self .dim *math .log (self .original_max_position /(self .beta_fast *2 *math .pi ))/
(2 *math .log (self .base ))),0 )
high =min (math .ceil (self .dim *math .log (self .original_max_position /(self .beta_slow *2 *math .pi ))/
(2 *math .log (self .base ))),self .dim -1 )
inv_freq =torch .zeros (self .dim //2 ,dtype =torch .float32 )
for i in range (self .dim //2 ):
if i <low :
inv_freq [i ]=inv_freq_interpolation [i ]
elif i >high :
inv_freq [i ]=inv_freq_extrapolation [i ]
else :
smooth =(i -low )/max (high -low ,1 )
inv_freq [i ]=(1 -smooth )*inv_freq_interpolation [i ]+smooth *inv_freq_extrapolation [i ]
return inv_freq
def _get_mscale (self ,scale :float )->float :
"""Get attention scaling factor for YaRN."""
if scale <=1 :
return 1.0
return 0.1 *math .log (scale )+1.0
def forward (self ,x :torch .Tensor ,position_ids :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ]:
device =x .device
inv_freq =self .inv_freq .to (device )
inv_freq_expanded =inv_freq [None ,:,None ].float ().expand (position_ids .shape [0 ],-1 ,1 )
position_ids_expanded =position_ids [:,None ,:].float ()
freqs =(inv_freq_expanded @position_ids_expanded ).transpose (1 ,2 )
emb =torch .cat ((freqs ,freqs ),dim =-1 )
mscale =self ._get_mscale (self .scaling_factor )*self .mscale
cos =emb .cos ().to (dtype =x .dtype )*mscale
sin =emb .sin ().to (dtype =x .dtype )*mscale
return cos ,sin
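# Usage sketch (illustrative only): the YaRN embedding returns per-position cos/sin tables
# already scaled by mscale. Shapes below assume head_dim=64 and a batch of 2.
#
#   rope = YaRNRotaryEmbedding(dim=64, max_position_embeddings=131072,
#                              original_max_position_embeddings=8192)
#   x = torch.randn(2, 8, 128, 64)                        # [B, heads, seq, head_dim]
#   pos = torch.arange(128).unsqueeze(0).expand(2, -1)    # [B, seq]
#   cos, sin = rope(x, pos)                               # each [B, seq, 64]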
LlamaRotaryEmbedding =YaRNRotaryEmbedding
def rotate_half (x :torch .Tensor )->torch .Tensor :
x1 =x [...,:x .shape [-1 ]//2 ]
x2 =x [...,x .shape [-1 ]//2 :]
return torch .cat ((-x2 ,x1 ),dim =-1 )
def apply_rotary_pos_emb (
q :torch .Tensor ,
k :torch .Tensor ,
cos :torch .Tensor ,
sin :torch .Tensor ,
position_ids :Optional [torch .Tensor ]=None ,
unsqueeze_dim :int =1 ,
)->Tuple [torch .Tensor ,torch .Tensor ]:
cos =cos .unsqueeze (unsqueeze_dim )
sin =sin .unsqueeze (unsqueeze_dim )
q_embed =(q *cos )+(rotate_half (q )*sin )
k_embed =(k *cos )+(rotate_half (k )*sin )
return q_embed ,k_embed
class KVCache :
"""Pre-allocated KV Cache — static buffer with index-based filling.
Eliminates VRAM fragmentation from torch.cat during autoregressive generation.
Buffer is allocated once at first use and reused via slice assignment.
"""
__slots__ =('key_cache','value_cache','seen_tokens','_max_len')
def __init__ (
self ,
key_cache :torch .Tensor =None ,
value_cache :torch .Tensor =None ,
seen_tokens :int =0 ,
max_seq_len :int =131072 ,
):
self .key_cache =key_cache
self .value_cache =value_cache
self .seen_tokens =seen_tokens
self ._max_len =max_seq_len
def _allocate (self ,batch :int ,heads :int ,head_dim :int ,device :torch .device ,dtype :torch .dtype ):
"""Allocate static buffer on first use."""
self .key_cache =torch .zeros (batch ,heads ,self ._max_len ,head_dim ,device =device ,dtype =dtype )
self .value_cache =torch .zeros (batch ,heads ,self ._max_len ,head_dim ,device =device ,dtype =dtype )
def update (
self ,
key_states :torch .Tensor ,
value_states :torch .Tensor ,
chunk_size :Optional [int ]=None ,
)->Tuple [torch .Tensor ,torch .Tensor ]:
batch ,heads ,new_len ,head_dim =key_states .shape
if self .key_cache is None :
self ._allocate (batch ,heads ,head_dim ,key_states .device ,key_states .dtype )
self .seen_tokens =0
if chunk_size is not None and self .seen_tokens +new_len >chunk_size *2 :
keep =chunk_size
if self .seen_tokens >keep :
self .key_cache [:,:,:keep ]=self .key_cache [:,:,self .seen_tokens -keep :self .seen_tokens ].clone ()
self .value_cache [:,:,:keep ]=self .value_cache [:,:,self .seen_tokens -keep :self .seen_tokens ].clone ()
self .seen_tokens =keep
if self .seen_tokens +new_len >self .key_cache .shape [2 ]:
new_max =max (self .key_cache .shape [2 ]*2 ,self .seen_tokens +new_len )
new_key =torch .zeros (batch ,heads ,new_max ,head_dim ,device =key_states .device ,dtype =key_states .dtype )
new_val =torch .zeros (batch ,heads ,new_max ,head_dim ,device =key_states .device ,dtype =key_states .dtype )
new_key [:,:,:self .seen_tokens ]=self .key_cache [:,:,:self .seen_tokens ]
new_val [:,:,:self .seen_tokens ]=self .value_cache [:,:,:self .seen_tokens ]
self .key_cache =new_key
self .value_cache =new_val
self .key_cache [:,:,self .seen_tokens :self .seen_tokens +new_len ]=key_states
self .value_cache [:,:,self .seen_tokens :self .seen_tokens +new_len ]=value_states
self .seen_tokens +=new_len
return self .key_cache [:,:,:self .seen_tokens ],self .value_cache [:,:,:self .seen_tokens ]
def reset (self ):
"""Reset cache position without deallocating the buffer."""
self .seen_tokens =0
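# Usage sketch (illustrative only): how an attention layer fills the pre-allocated cache
# during autoregressive decoding. Shapes are placeholders; the real callers pass the
# per-layer key/value projections.
#
#   cache = KVCache(max_seq_len=4096)
#   k0 = torch.randn(1, 8, 10, 64); v0 = torch.randn(1, 8, 10, 64)   # prefill: 10 tokens
#   k_all, v_all = cache.update(k0, v0)          # views of the first 10 slots
#   k1 = torch.randn(1, 8, 1, 64); v1 = torch.randn(1, 8, 1, 64)     # one decode step
#   k_all, v_all = cache.update(k1, v1)          # now length 11, no torch.cat involved
#   cache.reset()                                # reuse the same buffer for the next prompt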
def ring_attention (
query :torch .Tensor ,
key :torch .Tensor ,
value :torch .Tensor ,
chunk_size :int =4096 ,
causal :bool =True ,
)->torch .Tensor :
"""
Ring Attention for distributed long-context processing.
Processes sequence in chunks with online softmax accumulation.
Args:
query: [batch, heads, seq_len, head_dim]
key: [batch, heads, kv_len, head_dim]
value: [batch, heads, kv_len, head_dim]
chunk_size: Size of each attention chunk
causal: Whether to apply causal masking
Returns:
Output tensor [batch, heads, seq_len, head_dim]
"""
batch_size ,num_heads ,seq_len ,head_dim =query .shape
kv_len =key .shape [2 ]
if seq_len <=chunk_size and kv_len <=chunk_size :
qk_scale =head_dim **-0.25
use_causal =causal and seq_len ==kv_len and seq_len >1
if use_causal :
return F .scaled_dot_product_attention (
query *qk_scale ,key *qk_scale ,value ,
is_causal =True ,scale =1.0 ,
)
elif causal and kv_len >seq_len :
q_pos =torch .arange (seq_len ,device =query .device )+(kv_len -seq_len )
k_pos =torch .arange (kv_len ,device =query .device )
# Additive causal mask built from absolute positions; cast to the query dtype so SDPA receives a matching mask.
causal_mask =torch .where (k_pos .unsqueeze (0 )>q_pos .unsqueeze (1 ),float ('-inf'),0.0 ).to (query .dtype )
return F .scaled_dot_product_attention (
query *qk_scale ,key *qk_scale ,value ,
attn_mask =causal_mask ,scale =1.0 ,
)
else :
return F .scaled_dot_product_attention (
query *qk_scale ,key *qk_scale ,value ,
is_causal =False ,scale =1.0 ,
)
scale =head_dim **-0.5
output =torch .zeros_like (query )
max_logits =torch .full ((batch_size ,num_heads ,seq_len ,1 ),float ('-inf'),device =query .device ,dtype =query .dtype )
sum_exp =torch .zeros ((batch_size ,num_heads ,seq_len ,1 ),device =query .device ,dtype =query .dtype )
if causal :
q_positions =torch .arange (seq_len ,device =query .device )
if kv_len >seq_len :
q_positions =q_positions +(kv_len -seq_len )
num_kv_chunks =(kv_len +chunk_size -1 )//chunk_size
for kv_idx in range (num_kv_chunks ):
kv_start =kv_idx *chunk_size
kv_end =min ((kv_idx +1 )*chunk_size ,kv_len )
key_chunk =key [:,:,kv_start :kv_end ,:]
value_chunk =value [:,:,kv_start :kv_end ,:]
attn_chunk =torch .matmul (query ,key_chunk .transpose (-1 ,-2 ))*scale
if causal :
k_positions =torch .arange (kv_start ,kv_end ,device =query .device )
causal_mask =k_positions .unsqueeze (0 )>q_positions .unsqueeze (1 )
attn_chunk =attn_chunk .masked_fill (causal_mask .unsqueeze (0 ).unsqueeze (0 ),float ('-inf'))
chunk_max =attn_chunk .max (dim =-1 ,keepdim =True )[0 ]
new_max =torch .maximum (max_logits ,chunk_max )
exp_weights =torch .exp (attn_chunk -new_max )
exp_sum_chunk =exp_weights .sum (dim =-1 ,keepdim =True )
correction =torch .exp (max_logits -new_max )
output =output *correction +torch .matmul (exp_weights ,value_chunk )
sum_exp =sum_exp *correction +exp_sum_chunk
max_logits =new_max
output =output /(sum_exp +EPS )
return output
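# Sanity-check sketch (illustrative only): on a small example, ring_attention should match
# plain scaled_dot_product_attention, since the chunked online-softmax accumulation is
# mathematically equivalent to full softmax attention.
#
#   q = torch.randn(1, 4, 64, 32); k = torch.randn(1, 4, 64, 32); v = torch.randn(1, 4, 64, 32)
#   ref = F.scaled_dot_product_attention(q, k, v, is_causal=True)
#   out = ring_attention(q, k, v, chunk_size=16, causal=True)   # forces the chunked path
#   # torch.allclose(out, ref, atol=1e-4) is expected to hold up to numerical tolerance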
class MultiHeadLatentAttention (nn .Module ):
"""
Multi-Head Latent Attention (MLA) from DeepSeek-V2.
Compresses KV cache using low-rank projections for memory efficiency.
"""
def __init__ (
self ,
hidden_size :int ,
num_heads :int ,
num_kv_heads :int =None ,
head_dim :int =None ,
kv_lora_rank :int =512 ,
q_lora_rank :int =0 ,
rope_theta :float =500000.0 ,
max_position_embeddings :int =131072 ,
use_ring_attention :bool =True ,
ring_chunk_size :int =4096 ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_heads =num_heads
self .num_kv_heads =num_kv_heads or num_heads
self .head_dim =head_dim or hidden_size //num_heads
self .kv_lora_rank =kv_lora_rank
self .q_lora_rank =q_lora_rank
self .use_ring_attention =use_ring_attention
self .ring_chunk_size =ring_chunk_size
self .num_key_value_groups =self .num_heads //self .num_kv_heads
self .scale =self .head_dim **-0.5
if q_lora_rank >0 :
self .q_a_proj =nn .Linear (hidden_size ,q_lora_rank ,bias =False )
self .q_b_proj =nn .Linear (q_lora_rank ,num_heads *self .head_dim ,bias =False )
self .q_a_layernorm =LlamaRMSNorm (q_lora_rank )
else :
self .q_proj =nn .Linear (hidden_size ,num_heads *self .head_dim ,bias =False )
self .kv_a_proj =nn .Linear (hidden_size ,kv_lora_rank +self .head_dim ,bias =False )
self .kv_b_proj =nn .Linear (kv_lora_rank ,self .num_kv_heads *self .head_dim *2 ,bias =False )
self .kv_a_layernorm =LlamaRMSNorm (kv_lora_rank )
self .o_proj =nn .Linear (num_heads *self .head_dim ,hidden_size ,bias =False )
self .rotary_emb =YaRNRotaryEmbedding (
dim =self .head_dim ,
max_position_embeddings =max_position_embeddings ,
base =rope_theta ,
)
self ._init_weights ()
def _init_weights (self ):
std =0.02
for name ,module in self .named_modules ():
if isinstance (module ,nn .Linear ):
nn .init .normal_ (module .weight ,mean =0.0 ,std =std )
def forward (
self ,
hidden_states :torch .Tensor ,
attention_mask :Optional [torch .Tensor ]=None ,
position_ids :Optional [torch .Tensor ]=None ,
past_key_value :Optional [KVCache ]=None ,
output_attentions :bool =False ,
use_cache :bool =False ,
)->Tuple [torch .Tensor ,Optional [torch .Tensor ],Optional [KVCache ]]:
batch_size ,seq_len ,_ =hidden_states .shape
if self .q_lora_rank >0 :
q_compressed =self .q_a_layernorm (self .q_a_proj (hidden_states ))
query_states =self .q_b_proj (q_compressed )
else :
query_states =self .q_proj (hidden_states )
kv_compressed =self .kv_a_proj (hidden_states )
kv_latent ,k_pe =kv_compressed .split ([self .kv_lora_rank ,self .head_dim ],dim =-1 )
kv_latent =self .kv_a_layernorm (kv_latent )
kv_states =self .kv_b_proj (kv_latent )
query_states =query_states .view (batch_size ,seq_len ,self .num_heads ,self .head_dim ).transpose (1 ,2 )
key_states ,value_states =kv_states .split (self .num_kv_heads *self .head_dim ,dim =-1 )
key_states =key_states .view (batch_size ,seq_len ,self .num_kv_heads ,self .head_dim ).transpose (1 ,2 )
value_states =value_states .view (batch_size ,seq_len ,self .num_kv_heads ,self .head_dim ).transpose (1 ,2 )
if position_ids is None :
position_ids =torch .arange (seq_len ,device =hidden_states .device ).unsqueeze (0 ).expand (batch_size ,-1 )
if past_key_value is not None and past_key_value .seen_tokens >0 :
position_ids =position_ids +past_key_value .seen_tokens
cos ,sin =self .rotary_emb (hidden_states ,position_ids )
query_states ,key_states =apply_rotary_pos_emb (query_states ,key_states ,cos ,sin )
if past_key_value is not None :
key_states ,value_states =past_key_value .update (
key_states ,value_states ,
self .ring_chunk_size if self .use_ring_attention else None
)
if self .use_ring_attention :
if self .num_key_value_groups >1 :
key_expanded =key_states .repeat_interleave (self .num_key_value_groups ,dim =1 )
value_expanded =value_states .repeat_interleave (self .num_key_value_groups ,dim =1 )
else :
key_expanded =key_states
value_expanded =value_states
attn_output =ring_attention (
query_states ,key_expanded ,value_expanded ,
chunk_size =self .ring_chunk_size ,
causal =True ,
)
else :
qk_scale =self .head_dim **-0.25
kv_len =key_states .shape [2 ]
use_causal =(attention_mask is None and seq_len >1 and seq_len ==kv_len )
attn_output =F .scaled_dot_product_attention (
query_states *qk_scale ,
key_states *qk_scale ,
value_states ,
attn_mask =attention_mask ,
is_causal =use_causal ,
scale =1.0 ,
enable_gqa =(self .num_key_value_groups >1 ),
)
attn_output =attn_output .transpose (1 ,2 ).contiguous ().view (batch_size ,seq_len ,-1 )
attn_output =self .o_proj (attn_output )
return attn_output ,None ,past_key_value if use_cache else None
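# Usage sketch (illustrative only): MLA routes the KV projection through a
# kv_lora_rank-dimensional bottleneck, so the KV path scales with kv_lora_rank rather
# than hidden_size on the input side.
#
#   attn = MultiHeadLatentAttention(hidden_size=1024, num_heads=16, num_kv_heads=4,
#                                   kv_lora_rank=512, use_ring_attention=False)
#   x = torch.randn(2, 32, 1024)                        # [B, seq, hidden]
#   out, _, cache = attn(x, use_cache=True, past_key_value=KVCache(max_seq_len=1024))
#   # out: [2, 32, 1024]; cache.seen_tokens == 32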
class AuxLosslessMoERouter (nn .Module ):
"""
Aux-Lossless MoE Router with Shared Expert Isolation.
Eliminates auxiliary loss while maintaining load balance through architecture.
"""
def __init__ (
self ,
hidden_size :int ,
num_experts :int ,
top_k :int =2 ,
norm_topk_prob :bool =True ,
):
super ().__init__ ()
self .num_experts =num_experts
self .top_k =top_k
self .norm_topk_prob =norm_topk_prob
self .input_norm =LlamaRMSNorm (hidden_size )
self .gate =nn .Linear (hidden_size ,num_experts ,bias =False )
nn .init .normal_ (self .gate .weight ,mean =0.0 ,std =0.01 )
self .expert_bias =nn .Parameter (torch .zeros (num_experts ))
# Deep experts gate (4 deep experts)
self .num_deep_experts = 4
self .deep_gate = nn .Linear (hidden_size , self .num_deep_experts , bias =False )
nn .init .normal_ (self .deep_gate .weight , mean =0.0 , std =0.01 )
self .deep_expert_bias = nn .Parameter (torch .zeros (self .num_deep_experts ))
def forward (self ,hidden_states :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ,torch .Tensor ]:
batch_size ,seq_len ,hidden_dim =hidden_states .shape
hidden_flat =hidden_states .view (-1 ,hidden_dim )
hidden_norm =self .input_norm (hidden_flat )
# Standard experts
router_logits_std =self .gate (hidden_norm )
biased_logits_std =router_logits_std +self .expert_bias
# Deep experts
router_logits_deep = self .deep_gate (hidden_norm )
biased_logits_deep = router_logits_deep + self .deep_expert_bias
# Concatenate: [batch*seq, num_experts + num_deep_experts]
router_logits = torch .cat ([biased_logits_std , biased_logits_deep ], dim =-1 )
router_probs =F .softmax (router_logits ,dim =-1 ,dtype =hidden_states .dtype )
top_k_probs ,top_k_indices =torch .topk (router_probs ,self .top_k ,dim =-1 )
if self .norm_topk_prob :
top_k_probs =top_k_probs /(top_k_probs .sum (dim =-1 ,keepdim =True )+EPS )
return top_k_probs ,top_k_indices ,router_logits
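# Routing sketch (illustrative only): the router scores the standard experts and the 4 deep
# experts jointly, so top-k indices live in the range [0, num_experts + 3].
#
#   router = AuxLosslessMoERouter(hidden_size=1024, num_experts=8, top_k=2)
#   h = torch.randn(2, 16, 1024)
#   probs, idx, logits = router(h)
#   # probs/idx: [32, 2] (tokens flattened), logits: [32, 12] over all 8 + 4 experts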
class MoEExpert (nn .Module ):
"""Single MoE Expert with SwiGLU activation."""
def __init__ (self ,hidden_size :int ,intermediate_size :int ):
super ().__init__ ()
self .gate_proj =nn .Linear (hidden_size ,intermediate_size ,bias =False )
self .up_proj =nn .Linear (hidden_size ,intermediate_size ,bias =False )
self .down_proj =nn .Linear (intermediate_size ,hidden_size ,bias =False )
self .act_fn =nn .SiLU ()
self ._init_weights ()
def _init_weights (self ):
std =0.02
nn .init .normal_ (self .gate_proj .weight ,mean =0.0 ,std =std )
nn .init .normal_ (self .up_proj .weight ,mean =0.0 ,std =std )
nn .init .normal_ (self .down_proj .weight ,mean =0.0 ,std =std *0.5 )
def forward (self ,x :torch .Tensor )->torch .Tensor :
return self .down_proj (self .act_fn (self .gate_proj (x ))*self .up_proj (x ))
class DeepMoEExpert (nn .Module ):
"""Deep MoE Expert with multiple sequential SwiGLU transformations."""
def __init__ (self ,hidden_size :int ,intermediate_size :int ,depth :int =2 ):
super ().__init__ ()
self .depth = depth
self .gate_projs = nn .ModuleList ([nn .Linear (hidden_size if i == 0 else intermediate_size , intermediate_size , bias =False ) for i in range (depth )])
self .up_projs = nn .ModuleList ([nn .Linear (hidden_size if i == 0 else intermediate_size , intermediate_size , bias =False ) for i in range (depth )])
self .down_projs = nn .ModuleList ([nn .Linear (intermediate_size , intermediate_size if i < depth - 1 else hidden_size , bias =False ) for i in range (depth )])
self .act_fn = nn .SiLU ()
self ._init_weights ()
def _init_weights (self ):
std =0.02
for g , u , d in zip (self .gate_projs , self .up_projs , self .down_projs ):
nn .init .normal_ (g .weight ,mean =0.0 ,std =std )
nn .init .normal_ (u .weight ,mean =0.0 ,std =std )
nn .init .normal_ (d .weight ,mean =0.0 ,std =std *0.5 )
def forward (self ,x :torch .Tensor )->torch .Tensor :
for i in range (self .depth ):
# No internal residual connections; each stage is a plain SwiGLU block applied sequentially:
# input -> SwiGLU -> ... -> final down-projection back to hidden_size.
gate = self .act_fn (self .gate_projs [i ](x ))
up = self .up_projs [i ](x )
x = self .down_projs [i ](gate * up )
return x
class IsolatedSharedExpert (nn .Module ):
"""
Isolated Shared Expert that always processes all tokens.
Separate from routed experts to prevent competition.
"""
def __init__ (self ,hidden_size :int ,intermediate_size :int ):
super ().__init__ ()
self .gate_proj =nn .Linear (hidden_size ,intermediate_size ,bias =False )
self .up_proj =nn .Linear (hidden_size ,intermediate_size ,bias =False )
self .down_proj =nn .Linear (intermediate_size ,hidden_size ,bias =False )
self .act_fn =nn .SiLU ()
self ._init_weights ()
def _init_weights (self ):
std =0.02
nn .init .normal_ (self .gate_proj .weight ,mean =0.0 ,std =std )
nn .init .normal_ (self .up_proj .weight ,mean =0.0 ,std =std )
nn .init .normal_ (self .down_proj .weight ,mean =0.0 ,std =std *0.5 )
def forward (self ,x :torch .Tensor )->torch .Tensor :
return self .down_proj (self .act_fn (self .gate_proj (x ))*self .up_proj (x ))
class AuxLosslessMoELayer (nn .Module ):
"""
Aux-Lossless MoE Layer with Isolated Shared Expert.
No auxiliary loss needed - load balance maintained through isolation.
"""
def __init__ (
self ,
hidden_size :int ,
intermediate_size :int ,
num_experts :int =8 ,
num_experts_per_tok :int =2 ,
shared_expert_intermediate_size :int =None ,
):
super ().__init__ ()
self .hidden_size =hidden_size
self .num_experts =num_experts
self .num_experts_per_tok =num_experts_per_tok
self .router =AuxLosslessMoERouter (hidden_size ,num_experts ,num_experts_per_tok )
self .experts =nn .ModuleList ([
MoEExpert (hidden_size ,intermediate_size )
for _ in range (num_experts )
])
# Deep Experts: Depths 2, 3, 4, 5
self .num_deep_experts = 4
self .deep_experts = nn .ModuleList ([
DeepMoEExpert (hidden_size , intermediate_size , depth =d )
for d in range (2 , 6 )
])
shared_size =shared_expert_intermediate_size or intermediate_size
self .shared_expert =IsolatedSharedExpert (hidden_size ,shared_size )
def forward (self ,hidden_states :torch .Tensor )->Tuple [torch .Tensor ,torch .Tensor ]:
batch_size ,seq_len ,hidden_size =hidden_states .shape
original_dtype =hidden_states .dtype
hidden_flat =hidden_states .view (-1 ,hidden_size )
num_tokens =hidden_flat .shape [0 ]
top_k_probs ,top_k_indices ,router_logits =self .router (hidden_states )
if hasattr (self ,'_utilization_tracker'):
self ._utilization_tracker .record (top_k_indices )
final_output =torch .zeros_like (hidden_flat )
total_experts = self .num_experts + self .num_deep_experts
for expert_idx in range (total_experts ):
# Determine which expert list to use
if expert_idx < self .num_experts :
expert =self .experts [expert_idx ]
else :
expert =self .deep_experts [expert_idx - self .num_experts ]
for k in range (self .num_experts_per_tok ):
mask =(top_k_indices [:,k ]==expert_idx )
if mask .any ():
expert_input =hidden_flat [mask ]
expert_output =expert (expert_input )
weight =top_k_probs [mask ,k :k +1 ]
weighted_output =(weight *expert_output ).to (original_dtype )
final_output [mask ]=final_output [mask ]+weighted_output
shared_output =self .shared_expert (hidden_flat )
final_output =final_output +shared_output .to (original_dtype )
final_output =final_output .view (batch_size ,seq_len ,hidden_size )
aux_loss =self ._compute_aux_loss (router_logits ,top_k_indices ,num_tokens )
return final_output ,aux_loss
def _compute_aux_loss (
self ,
router_logits :torch .Tensor ,
top_k_indices :torch .Tensor ,
num_tokens :int ,
)->torch .Tensor :
"""
Aux-lossless auxiliary loss.
Uses z-loss to keep router logits from growing unboundedly (FP16 stability),
plus a soft utilization penalty that activates only when experts go completely
cold. The expert_bias parameter handles routine load balancing.
"""
z_loss =torch .logsumexp (router_logits ,dim =-1 ).square ().mean ()*0.0001
# Add penalty for choosing deep experts
# Depths are 2, 3, 4, 5 for indices (num_experts) to (num_experts + 3)
# Cost is roughly proportional to depth
deep_penalty = torch .tensor (0.0 , device =router_logits .device , dtype =router_logits .dtype )
# Calculate how often each deep expert was selected
# top_k_indices shape: [batch*seq, top_k]
for i in range (self .num_deep_experts ):
expert_idx = self .num_experts + i
depth = i + 2 # depths 2, 3, 4, 5
# Count how many times this deep expert was chosen in top-k
selection_count = (top_k_indices == expert_idx ).sum ()
# Simple penalty: deeper experts cost more
# Multiplied by a small scalar to act as a soft deterrent
# The model must truly need the depth to offset this loss increase
deep_penalty += selection_count .float () * depth * 0.00005
# Utilization penalty from the docstring: only kicks in when some experts (standard or deep) go completely cold.
expert_mask =F .one_hot (top_k_indices ,self .num_experts +self .num_deep_experts ).float ()
tokens_per_expert =expert_mask .sum (dim =(0 ,1 ))
fraction_used =(tokens_per_expert >0 ).float ().mean ()
utilization_loss =(1.0 -fraction_used )*0.01
return z_loss +deep_penalty +utilization_loss
MoELayer =AuxLosslessMoELayer
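# Usage sketch (illustrative only): one MoE layer pass. Every token is processed by the
# isolated shared expert, plus its top-k routed experts (standard or deep).
#
#   moe = AuxLosslessMoELayer(hidden_size=1024, intermediate_size=2816,
#                             num_experts=8, num_experts_per_tok=2)
#   h = torch.randn(2, 16, 1024)
#   out, aux_loss = moe(h)        # out: [2, 16, 1024]; aux_loss is a scalar tensor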
class MoELlamaDecoderLayer (nn .Module ):
"""Decoder layer with MLA and Aux-Lossless MoE."""
def __init__ (self ,config ,layer_idx :int ,moe_config :dict =None ):
super ().__init__ ()
self .hidden_size =config .hidden_size
self .layer_idx =layer_idx
use_ring =getattr (config ,'use_ring_attention',True )
ring_chunk =getattr (config ,'ring_attention_chunk_size',4096 )
num_kv_heads =getattr (config ,'num_key_value_heads',config .num_attention_heads //4 )
self .self_attn =MultiHeadLatentAttention (
hidden_size =config .hidden_size ,
num_heads =config .num_attention_heads ,
num_kv_heads =num_kv_heads ,
rope_theta =getattr (config ,'rope_theta',500000.0 ),
max_position_embeddings =config .max_position_embeddings ,
use_ring_attention =use_ring ,
ring_chunk_size =ring_chunk ,
)
self .input_layernorm =LlamaRMSNorm (config .hidden_size ,eps =config .rms_norm_eps )
self .post_attention_layernorm =LlamaRMSNorm (config .hidden_size ,eps =config .rms_norm_eps )
self .use_moe =moe_config and moe_config .get ('use_moe',False )
moe_freq =moe_config .get ('moe_layer_freq',2 )if moe_config else 2
if self .use_moe and layer_idx %moe_freq ==(moe_freq -1 ):
self .mlp =AuxLosslessMoELayer (
hidden_size =config .hidden_size ,
intermediate_size =moe_config .get ('intermediate_size',config .intermediate_size ),
num_experts =moe_config .get ('num_experts',8 ),
num_experts_per_tok =moe_config .get ('num_experts_per_tok',2 ),
)
self .is_moe_layer =True
else :
self .mlp =MoEExpert (config .hidden_size ,config .intermediate_size )
self .is_moe_layer =False
def forward (
self ,
hidden_states :torch .Tensor ,
attention_mask :Optional [torch .Tensor ]=None ,
position_ids :Optional [torch .Tensor ]=None ,
past_key_value :Optional [KVCache ]=None ,
output_attentions :bool =False ,
use_cache :bool =False ,
)->Tuple [torch .Tensor ,Optional [torch .Tensor ],Optional [KVCache ],Optional [torch .Tensor ]]:
residual =hidden_states
hidden_states =self .input_layernorm (hidden_states )
hidden_states ,_ ,present_key_value =self .self_attn (
hidden_states =hidden_states ,
attention_mask =attention_mask ,
position_ids =position_ids ,
past_key_value =past_key_value ,
output_attentions =output_attentions ,
use_cache =use_cache ,
)
hidden_states =residual +hidden_states
residual =hidden_states
hidden_states =self .post_attention_layernorm (hidden_states )
aux_loss =None
if self .is_moe_layer :
hidden_states ,aux_loss =self .mlp (hidden_states )
else :
hidden_states =self .mlp (hidden_states )
hidden_states =residual +hidden_states
return hidden_states ,None ,present_key_value ,aux_loss
@dataclass
class MoELlamaModelOutput :
last_hidden_state :torch .Tensor
past_key_values :Optional [List [KVCache ]]=None
hidden_states :Optional [Tuple [torch .Tensor ]]=None
attentions :Optional [Tuple [torch .Tensor ]]=None
aux_loss :Optional [torch .Tensor ]=None
class MoELlamaModel (nn .Module ):
"""MoE LLaMA Model with MLA and Ring Attention."""
def __init__ (self ,config ,moe_config :dict =None ):
super ().__init__ ()
self .config =config
self .moe_config =moe_config
self .gradient_checkpointing =False
self .embed_tokens =nn .Embedding (config .vocab_size ,config .hidden_size )
self .layers =nn .ModuleList ([
MoELlamaDecoderLayer (config ,layer_idx ,moe_config )
for layer_idx in range (config .num_hidden_layers )
])
self .norm =LlamaRMSNorm (config .hidden_size ,eps =config .rms_norm_eps )
self .num_moe_layers =sum (1 for layer in self .layers if layer .is_moe_layer )
# ── Coconut: Continuous Thought components ──
# Learned gate controls how much recurrent thought vs original input
# to retain at each thinking step. Sigmoid output in [0,1].
self .thought_gate = nn .Linear (config .hidden_size , 1 , bias =True )
nn .init .constant_ (self .thought_gate .bias , -2.0 ) # Initialize gate biased toward original (sigmoid(-2)≈0.12)
self .thought_layernorm = LlamaRMSNorm (config .hidden_size , eps =config .rms_norm_eps )
# Halt head: dynamically decides when to stop thinking
self .thought_halt_head = nn .Linear (config .hidden_size , 1 , bias =True )
nn .init .constant_ (self .thought_halt_head .bias , -2.0 ) # Biased toward continuing to think initially
# Fast ponder block: an attention-free path for latent reasoning steps.
# It runs a deep SwiGLU stack only, avoiding the O(N^2) self-attention cost of a full decoder pass.
self .fast_ponder_block = DeepMoEExpert (config .hidden_size , config .intermediate_size , depth =3 )
self ._init_weights ()
def _init_weights (self ):
nn .init .normal_ (self .embed_tokens .weight ,mean =0.0 ,std =0.02 )
def gradient_checkpointing_enable (self ):
"""Enable gradient checkpointing for memory efficiency."""
self .gradient_checkpointing =True
def gradient_checkpointing_disable (self ):
"""Disable gradient checkpointing."""
self .gradient_checkpointing =False
def forward (
self ,
input_ids :Optional [torch .Tensor ]=None ,
attention_mask :Optional [torch .Tensor ]=None ,
position_ids :Optional [torch .Tensor ]=None ,
inputs_embeds :Optional [torch .Tensor ]=None ,
past_key_values :Optional [List [KVCache ]]=None ,
use_cache :bool =False ,
output_attentions :bool =False ,
output_hidden_states :bool =False ,
return_dict :bool =True ,
cache_position :Optional [torch .Tensor ]=None ,
thinking_depth :int =0 ,
)->Union [Tuple ,MoELlamaModelOutput ]:
if inputs_embeds is None :
inputs_embeds =self .embed_tokens (input_ids )
hidden_states =inputs_embeds
batch_size ,seq_len =hidden_states .shape [:2 ]
if position_ids is None :
position_ids =torch .arange (seq_len ,device =hidden_states .device ).unsqueeze (0 ).expand (batch_size ,-1 )
if past_key_values is None :
past_key_values =[None ]*len (self .layers )
all_hidden_states =()if output_hidden_states else None
all_attentions =()if output_attentions else None
next_cache =[]if use_cache else None
total_aux_loss =torch .tensor (0.0 ,device =hidden_states .device ,dtype =hidden_states .dtype )
for idx ,layer in enumerate (self .layers ):
if output_hidden_states :
all_hidden_states =all_hidden_states +(hidden_states ,)
if self .gradient_checkpointing and self .training and not use_cache :
def create_custom_forward (module ):
def custom_forward (*inputs ):
return module (*inputs )
return custom_forward
layer_outputs =torch .utils .checkpoint .checkpoint (
create_custom_forward (layer ),
hidden_states ,
attention_mask ,
position_ids ,
past_key_values [idx ],
output_attentions ,
use_cache ,
use_reentrant =False ,
)
hidden_states ,attn_weights ,present_key_value ,aux_loss =layer_outputs
else :
hidden_states ,attn_weights ,present_key_value ,aux_loss =layer (
hidden_states =hidden_states ,
attention_mask =attention_mask ,
position_ids =position_ids ,
past_key_value =past_key_values [idx ],
output_attentions =output_attentions ,
use_cache =use_cache ,
)
if use_cache :
next_cache .append (present_key_value )
if aux_loss is not None :
total_aux_loss =total_aux_loss +aux_loss
if output_attentions and attn_weights is not None :
all_attentions =all_attentions +(attn_weights ,)
# ── Coconut: Continuous Thought Loop ──
# After the normal pass, loop hidden states back through the
# transformer layers for extra computation in latent space.
# No tokens are decoded — pure continuous reasoning.
if thinking_depth > 0 :
original_hidden = hidden_states .clone ()
thought_position_ids = torch .arange (
seq_len , device =hidden_states .device
).unsqueeze (0 ).expand (batch_size , -1 )
for thought_step in range (thinking_depth ):
# Check if we should halt thinking (only during inference or if forced)
# We evaluate the halt head on the *current* hidden state of the last token
halt_logits = self .thought_halt_head (hidden_states [:, -1:, :])
halt_prob = torch .sigmoid (halt_logits )
# If during generation we decide to stop, break early
if not self .training and (halt_prob > 0.5 ).all ():
break
# Normalize before processing
hidden_states = self .thought_layernorm (hidden_states )
# Run the hidden states through the attention-free fast ponder block only;
# skipping the O(N^2) self-attention stack makes each thought step far cheaper than a full decoder pass.
hidden_states = self .fast_ponder_block (hidden_states )
# Gated residual: blend thought with original
# gate ∈ [0,1], initialized small so early training
# stays close to original behavior
gate = torch .sigmoid (self .thought_gate (hidden_states ))
hidden_states = gate * hidden_states + (1.0 - gate ) * original_hidden
hidden_states =self .norm (hidden_states )
if output_hidden_states :
all_hidden_states =all_hidden_states +(hidden_states ,)
return MoELlamaModelOutput (
last_hidden_state =hidden_states ,
past_key_values =next_cache if use_cache else None ,
hidden_states =all_hidden_states ,
attentions =all_attentions ,
aux_loss =total_aux_loss ,
)
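# Usage sketch (illustrative only): thinking_depth > 0 triggers the Coconut-style latent loop
# after the decoder stack. The tiny config below is a stand-in for the example, not a real Xoron size.
#
#   cfg = LlamaConfig(vocab_size=1000, hidden_size=256, intermediate_size=512,
#                     num_hidden_layers=2, num_attention_heads=4, max_position_embeddings=2048)
#   core = MoELlamaModel(cfg, moe_config={'use_moe': True, 'num_experts': 4,
#                                         'num_experts_per_tok': 2, 'intermediate_size': 512})
#   ids = torch.randint(0, 1000, (1, 8))
#   out = core(input_ids=ids, thinking_depth=2)
#   # out.last_hidden_state: [1, 8, 256]; out.aux_loss aggregates the MoE z-loss terms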
@dataclass
class CausalLMOutput :
loss :Optional [torch .Tensor ]=None
logits :torch .Tensor =None
past_key_values :Optional [List [KVCache ]]=None
hidden_states :Optional [Tuple [torch .Tensor ]]=None
attentions :Optional [Tuple [torch .Tensor ]]=None
aux_loss :Optional [torch .Tensor ]=None
class MoELlamaForCausalLM (nn .Module ):
"""MoE LLaMA for Causal Language Modeling with MLA and Ring Attention."""
def __init__ (self ,config ,moe_config :dict =None ):
super ().__init__ ()
self .config =config
self .moe_config =moe_config
self .model =MoELlamaModel (config ,moe_config )
self .lm_head =nn .Linear (config .hidden_size ,config .vocab_size ,bias =False )
if getattr (config ,'tie_word_embeddings',True ):
self .lm_head .weight =self .model .embed_tokens .weight
self .apply (self ._init_weights )
def _init_weights (self ,module ):
std =0.02
if isinstance (module ,nn .Linear ):
nn .init .normal_ (module .weight ,mean =0.0 ,std =std )
if module .bias is not None :
nn .init .zeros_ (module .bias )
elif isinstance (module ,nn .Embedding ):
nn .init .normal_ (module .weight ,mean =0.0 ,std =std )
def get_input_embeddings (self )->nn .Embedding :
return self .model .embed_tokens
def set_input_embeddings (self ,value :nn .Embedding ):
self .model .embed_tokens =value
def get_output_embeddings (self )->nn .Linear :
return self .lm_head
def set_output_embeddings (self ,new_embeddings :nn .Linear ):
self .lm_head =new_embeddings
def gradient_checkpointing_enable (self ):
"""Enable gradient checkpointing for memory efficiency."""
self .model .gradient_checkpointing_enable ()
def gradient_checkpointing_disable (self ):
"""Disable gradient checkpointing."""
self .model .gradient_checkpointing_disable ()
def prepare_inputs_for_generation (
self ,
input_ids :torch .Tensor ,
past_key_values :Optional [List [KVCache ]]=None ,
attention_mask :Optional [torch .Tensor ]=None ,
inputs_embeds :Optional [torch .Tensor ]=None ,
**kwargs ,
)->dict :
if past_key_values is not None :
input_ids =input_ids [:,-1 :]
position_ids =kwargs .get ("position_ids",None )
if attention_mask is not None and position_ids is None :
position_ids =attention_mask .long ().cumsum (-1 )-1
position_ids .masked_fill_ (attention_mask ==0 ,1 )
if past_key_values is not None :
position_ids =position_ids [:,-1 :]
return {
"input_ids":input_ids ,
"past_key_values":past_key_values ,
"use_cache":kwargs .get ("use_cache",True ),
"position_ids":position_ids ,
"attention_mask":attention_mask ,
}
def forward (
self ,
input_ids :Optional [torch .Tensor ]=None ,
attention_mask :Optional [torch .Tensor ]=None ,
position_ids :Optional [torch .Tensor ]=None ,
inputs_embeds :Optional [torch .Tensor ]=None ,
labels :Optional [torch .Tensor ]=None ,
past_key_values :Optional [List [KVCache ]]=None ,
use_cache :bool =False ,
output_attentions :bool =False ,
output_hidden_states :bool =False ,
return_dict :bool =True ,
cache_position :Optional [torch .Tensor ]=None ,
thinking_depth :int =0 ,
**kwargs ,
)->Union [Tuple ,CausalLMOutput ]:
outputs =self .model (
input_ids =input_ids ,
attention_mask =attention_mask ,
position_ids =position_ids ,
inputs_embeds =inputs_embeds ,
past_key_values =past_key_values ,
use_cache =use_cache ,
output_attentions =output_attentions ,
output_hidden_states =output_hidden_states ,
return_dict =True ,
cache_position =cache_position ,
thinking_depth =thinking_depth ,
)
hidden_states =outputs .last_hidden_state
aux_loss =outputs .aux_loss
logits =self .lm_head (hidden_states )
loss =None
if labels is not None :
shift_logits =logits [...,:-1 ,:].contiguous ()
shift_labels =labels [...,1 :].contiguous ()
if shift_labels .dtype !=torch .long :
shift_labels =shift_labels .long ()
valid_mask =(shift_labels !=-100 )
num_valid =valid_mask .sum ().item ()
if num_valid >0 :
loss_fct =nn .CrossEntropyLoss (ignore_index =-100 )
loss =loss_fct (
shift_logits .view (-1 ,shift_logits .size (-1 )),
shift_labels .view (-1 )
)
loss =torch .clamp (loss ,min =0.0 ,max =100.0 )
else :
loss =torch .tensor (0.0 ,device =logits .device ,dtype =logits .dtype ,requires_grad =True )
return CausalLMOutput (
loss =loss ,
logits =logits ,
past_key_values =outputs .past_key_values ,
hidden_states =outputs .hidden_states ,
attentions =outputs .attentions ,
aux_loss =aux_loss ,
)
@torch .no_grad ()
def generate (
self ,
input_ids :torch .Tensor ,
max_new_tokens :int =100 ,
temperature :float =1.0 ,
top_k :int =50 ,
top_p :float =0.9 ,
do_sample :bool =True ,
pad_token_id :Optional [int ]=None ,
eos_token_id :Optional [int ]=None ,
attention_mask :Optional [torch .Tensor ]=None ,
thinking_depth :int =0 ,
**kwargs ,
)->torch .Tensor :
batch_size =input_ids .shape [0 ]
device =input_ids .device
past_key_values =None
is_prefill =True # Deep thinking only on first pass (full context)
if attention_mask is None :
attention_mask =torch .ones_like (input_ids )
for _ in range (max_new_tokens ):
model_inputs =self .prepare_inputs_for_generation (
input_ids ,
past_key_values =past_key_values ,
attention_mask =attention_mask ,
)
# Apply thinking depth only on prefill, not per-token steps
current_depth = thinking_depth if is_prefill else 0
outputs =self .forward (**model_inputs ,use_cache =True ,return_dict =True ,thinking_depth =current_depth )
is_prefill =False
next_token_logits =outputs .logits [:,-1 ,:]
if temperature !=1.0 :
next_token_logits =next_token_logits /temperature
if do_sample :
if top_k >0 :
indices_to_remove =next_token_logits <torch .topk (next_token_logits ,top_k )[0 ][...,-1 ,None ]
next_token_logits [indices_to_remove ]=float ('-inf')
if top_p <1.0 :
sorted_logits ,sorted_indices =torch .sort (next_token_logits ,descending =True )
cumulative_probs =torch .cumsum (F .softmax (sorted_logits ,dim =-1 ),dim =-1 )
sorted_indices_to_remove =cumulative_probs >top_p
sorted_indices_to_remove [...,1 :]=sorted_indices_to_remove [...,:-1 ].clone ()
sorted_indices_to_remove [...,0 ]=0
indices_to_remove =sorted_indices_to_remove .scatter (1 ,sorted_indices ,sorted_indices_to_remove )
next_token_logits [indices_to_remove ]=float ('-inf')
probs =F .softmax (next_token_logits ,dim =-1 )
next_tokens =torch .multinomial (probs ,num_samples =1 ).squeeze (-1 )
else :
next_tokens =torch .argmax (next_token_logits ,dim =-1 )
input_ids =torch .cat ([input_ids ,next_tokens .unsqueeze (-1 )],dim =-1 )
attention_mask =torch .cat ([attention_mask ,torch .ones ((batch_size ,1 ),device =device )],dim =-1 )
past_key_values =outputs .past_key_values
if eos_token_id is not None and (next_tokens ==eos_token_id ).all ():
break
return input_ids
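# Usage sketch (illustrative only): sampling with a few latent "thinking" iterations on the
# prefill pass. The prompt ids are placeholders; thinking_depth=0 recovers standard decoding.
#
#   lm = MoELlamaForCausalLM(cfg, moe_config)          # cfg/moe_config as in the sketch above
#   prompt = torch.randint(0, 1000, (1, 8))
#   out_ids = lm.generate(prompt, max_new_tokens=16, temperature=0.8,
#                         top_k=50, top_p=0.9, thinking_depth=2)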
# ==============================================================================
# MODELS.XORON
# ==============================================================================
logger =logging .getLogger (__name__ )
MAX_HIDDEN =10000.0
def safe_clamp_tensor (x :torch .Tensor ,max_val :float =MAX_HIDDEN )->torch .Tensor :
"""Clamp tensor values for FP16 safety, handling NaN/Inf properly.
WARNING: Only use for linear/hidden states, NOT for attention scores before softmax!
For attention scores, use a max of ~11.0 to prevent exp() overflow.
CRITICAL: torch.clamp does NOT fix NaN! clamp(nan, -10, 10) = nan
Must use nan_to_num first.
"""
if x is None or x .numel ()==0 :
return x
x =torch .nan_to_num (x ,nan =0.0 ,posinf =max_val ,neginf =-max_val )
return x .clamp (-max_val ,max_val )
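# Worked example (illustrative only): torch.clamp alone would propagate NaN, so the helper
# routes values through nan_to_num first.
#
#   x = torch.tensor([float('nan'), float('inf'), -1e9, 5.0])
#   safe_clamp_tensor(x, max_val=10000.0)   # -> tensor([0., 10000., -10000., 5.])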
COMPONENT_GROUPS ={
'vision':['vision_encoder','projector'],
'video':['video_encoder'],
'audio':['audio_encoder','audio_decoder','audio_projector','waveform_decoder'],
'speech':['waveform_decoder'],
'llm':['llm'],
'cross_attention':['cross_attention_layers'],
'image_generation':['generator'],
'video_generation':['video_generator'],
'modality_markers':['image_start','image_end','video_start','video_end','audio_start','audio_end'],
}
class MultimodalModelOutput (dict ):
"""Output class for multimodal model."""
def __getattr__ (self ,name ):
try :
return self [name ]
except KeyError :
raise AttributeError (f"'{type (self ).__name__ }' has no attribute '{name }'")
def __setattr__ (self ,name ,value ):
self [name ]=value
class XoronMultimodalModel (nn .Module ):
"""
Xoron-Dev: Complete multimodal model with:
- Image/video understanding (CLIP)
- Text generation (MoE LLM)
- Image/video generation (MobileDiffusion)
- Voice understanding and generation (ASR/TTS)
- Cross-attention for multimodal fusion
- LoRA support for efficient fine-tuning
- Flash Attention for faster training
- Model Parallelism support for multi-GPU training
"""
def __init__ (self ,config :XoronConfig ,device_map :Dict [str ,str ]=None ):
super ().__init__ ()
self .config =config
self .device_map =device_map
if device_map is not None :
device_values =[v for v in device_map .values ()if isinstance (v ,str )]
self ._model_parallel =len (set (device_values ))>1
else :
self ._model_parallel =False
logger .info ("Initializing Xoron-Dev Multimodal Model Build")
if self ._model_parallel :
logger .info (" ⚡ Model Parallelism: ENABLED")
self .vision_encoder =VisionEncoder (config .vision_model_name ,freeze =config .freeze_vision )
self .video_encoder =VideoEncoder (self .vision_encoder ,max_frames =config .video_max_frames )
logger .info ("Building SOTA Audio Encoder...")
self .audio_encoder =AudioEncoder (
hidden_size =config .hidden_size ,
n_mels =80 ,
max_audio_length =3000 ,
use_raw_waveform =getattr (config ,'use_raw_waveform',True ),
)
logger .info ("Building SOTA Audio Decoder...")
self .audio_decoder =AudioDecoder (
hidden_size =config .hidden_size ,
n_mels =80 ,
max_audio_length =1000 ,
)
logger .info ("Building Raw Waveform Decoder (Speech-to-Speech)...")
self .waveform_decoder =RawWaveformDecoder (
hidden_size =config .hidden_size ,
sample_rate =getattr (config ,'audio_sample_rate',16000 ),
)
llm_config =LlamaConfig (
vocab_size =config .vocab_size ,
hidden_size =config .hidden_size ,
intermediate_size =config .intermediate_size ,
num_hidden_layers =config .num_layers ,
num_attention_heads =config .num_heads ,
max_position_embeddings =config .max_position_embeddings ,
rms_norm_eps =1e-6 ,
tie_word_embeddings =getattr (config ,'tie_word_embeddings',True ),
pad_token_id =0 ,
)
llm_config .use_flash_attention =config .use_flash_attention
llm_config .use_ring_attention =getattr (config ,'use_ring_attention',True )
llm_config .ring_attention_chunk_size =getattr (config ,'ring_attention_chunk_size',4096 )
moe_config ={
'use_moe':config .use_moe ,
'num_experts':config .num_experts ,
'num_experts_per_tok':config .num_experts_per_tok ,
'moe_layer_freq':config .moe_layer_freq ,
'intermediate_size':config .intermediate_size ,
}
logger .info (f"Building LLM Core: {config .hidden_size }d, {config .num_layers }L")
logger .info (f" 📏 Context: {config .max_position_embeddings //1024 }K positions")
if config .use_ring_attention :
logger .info (f" 🔄 Ring Attention Enabled (chunk size: {config .ring_attention_chunk_size })")
logger .info (f" 🎯 MoE: {config .num_experts } experts, top-{config .num_experts_per_tok }")
self .llm =MoELlamaForCausalLM (llm_config ,moe_config )
logger .info (f" ✅ MoE layers initialized: {self .llm .model .num_moe_layers }/{config .num_layers }")
self .projector =MultimodalProjector (
self .vision_encoder .hidden_size ,
config .hidden_size ,
config .num_vision_tokens
)
logger .info (f" 🔗 Projector initialized: {self .vision_encoder .hidden_size } -> {config .hidden_size }")
self .audio_projector =nn .Linear (config .hidden_size ,config .hidden_size )
self .image_start =nn .Parameter (torch .randn (1 ,1 ,config .hidden_size )*0.02 )
self .image_end =nn .Parameter (torch .randn (1 ,1 ,config .hidden_size )*0.02 )
self .video_start =nn .Parameter (torch .randn (1 ,1 ,config .hidden_size )*0.02 )
self .video_end =nn .Parameter (torch .randn (1 ,1 ,config .hidden_size )*0.02 )
self .audio_start =nn .Parameter (torch .randn (1 ,1 ,config .hidden_size )*0.02 )
self .audio_end =nn .Parameter (torch .randn (1 ,1 ,config .hidden_size )*0.02 )
self .cross_attention_layers =None
if config .use_cross_attention :
logger .info (f"Building Cross-Attention Fusion ({config .cross_attention_layers } layers)...")
self .cross_attention_layers =nn .ModuleList ([
MultimodalFusionLayer (
hidden_size =config .hidden_size ,
num_heads =config .cross_attention_heads ,
dropout =config .cross_attention_dropout ,
use_flash_attention =config .use_flash_attention ,
)
for _ in range (config .cross_attention_layers )
])
logger .info (f" ✅ Cross-attention: {config .cross_attention_layers } layers, {config .cross_attention_heads } heads")
self .generator =None
if config .enable_generation :
logger .info ("Building MobileDiffusion Generators (Image & Video)...")
self .generator =MobileDiffusionGenerator (
latent_channels =config .generation_latent_channels ,
base_channels =config .generation_base_channels ,
context_dim =config .hidden_size ,
num_inference_steps =config .generation_inference_steps ,
image_size =config .image_max_size ,
)
self .video_generator =None
if config .enable_generation :
self .video_generator =MobileVideoDiffusion (
latent_channels =config .generation_latent_channels ,
base_channels =config .generation_base_channels //2 ,
context_dim =config .hidden_size ,
num_frames =config .video_max_frames ,
image_size =config .video_max_size ,
num_inference_steps =config .generation_inference_steps ,
)
self .num_vision_tokens =config .num_vision_tokens
self .video_max_frames =config .video_max_frames
self .lora_applied =False
self ._print_stats ()
logger .info ("Xoron-Dev Multimodal Model Build Complete")
def apply_model_parallel (self ,device_map :Dict [str ,str ]):
"""Apply Model Parallelism by sharding components across devices.
Trained components get their layers split across all training GPUs.
Frozen components go to CPU. Small components (projectors, markers)
go to the primary GPU.
"""
self .device_map =device_map
training_gpus = device_map .get ('training_gpus', ['cuda:0'])
primary = device_map .get ('primary', 'cuda:0')
if len (training_gpus ) <= 1 and not any (v == 'cpu' for v in device_map .values () if isinstance (v, str)):
logger .info (" ℹ️ Single device - no model parallelism needed")
return self
self ._model_parallel = True
logger .info ("Applying Model Parallelism (layer sharding)...")
def _shard_module (module, name, gpus):
"""Shard a module's sub-layers across GPUs."""
# Find shardable sub-layers (nn.ModuleList children)
layer_lists = []
for attr_name in dir (module):
attr = getattr (module, attr_name, None)
if isinstance (attr, nn .ModuleList) and len (attr) > 0:
layer_lists .append ((attr_name, attr))
if layer_lists:
# Shard the largest ModuleList across GPUs
layer_lists .sort (key=lambda x: len (x[1]), reverse=True)
list_name, layers = layer_lists [0]
for i, layer in enumerate (layers):
target_gpu = gpus [i % len (gpus)]
layer .to (target_gpu)
# Put remaining params on primary GPU
for param_name, param in module .named_parameters ():
if f'{list_name}.' not in param_name:
param .data = param .data .to (gpus [0])
logger .info (f" ✅ {name}: {len(layers)} layers sharded across {gpus}")
else:
# No layers to shard — put whole module on first GPU
module .to (gpus [0])
logger .info (f" ✅ {name} -> {gpus[0]}")
# Map component names to actual attributes
component_attrs = {
'vision_encoder': 'vision_encoder',
'video_encoder': 'video_encoder',
'audio_encoder': 'audio_encoder',
'audio_decoder': 'audio_decoder',
'waveform_decoder': 'waveform_decoder',
'projector': 'projector',
'audio_projector': 'audio_projector',
'llm': 'llm',
'cross_attention': 'cross_attention_layers',
'generator': 'generator',
'video_generator': 'video_generator',
}
for comp_name, attr_name in component_attrs .items ():
comp = getattr (self, attr_name, None)
if comp is None:
continue
target = device_map .get (comp_name, 'cpu')
if target == 'cpu':
comp .to ('cpu')
logger .info (f" ❄️ {comp_name} -> cpu (frozen)")
else:
# Shard across all training GPUs
_shard_module (comp, comp_name, training_gpus)
# Modality markers → primary GPU
marker_device = device_map .get ('modality_markers', primary)
if marker_device != 'cpu':
marker_device = primary
for marker_name in ['image_start', 'image_end', 'video_start', 'video_end', 'audio_start', 'audio_end']:
marker = getattr (self, marker_name, None)
if marker is not None:
setattr (self, marker_name, nn .Parameter (marker .data .to (marker_device)))
logger .info (f" ✅ Modality markers -> {marker_device}")
logger .info ("Model Parallelism applied successfully!")
return self
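# Device-map sketch (illustrative only): the keys below mirror what apply_model_parallel and
# get_llm_device/get_encoder_device read. The exact keys and devices shown are assumptions for
# the example, not a required schema beyond what the methods above access.
#
#   device_map = {
#       'primary': 'cuda:0',
#       'training_gpus': ['cuda:0', 'cuda:1'],
#       'llm': 'cuda:0',               # read by get_llm_device()
#       'vision_encoder': 'cuda:0',    # read by get_encoder_device()
#       'audio_encoder': 'cpu',        # frozen components are parked on CPU
#       'modality_markers': 'cuda:0',
#   }
#   model = XoronMultimodalModel(config, device_map=device_map).apply_model_parallel(device_map)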
def get_llm_device (self ):
"""Get the device where LLM is located."""
if self .device_map is not None :
return torch .device (self .device_map ['llm'])
return next (self .llm .parameters ()).device
def generate (self ,*args ,**kwargs ):
"""
Delegates generation to the internal LLM.
This allows the model to be treated as a causal LM in many pipelines.
"""
return self .llm .generate (*args ,**kwargs )
def get_encoder_device (self ):
"""Get the device where encoders are located."""
if self .device_map is not None :
return torch .device (self .device_map ['vision_encoder'])
return next (self .vision_encoder .parameters ()).device
def apply_lora (self ):
"""
Apply LoRA to the LLM and optionally cross-attention layers.
MEMORY OPTIMIZATION:
- LoRA layers share base weights (no cloning)
- Base weights in LoRA layers are frozen (requires_grad=False)
- LoRA params (A, B, magnitude) are always trainable
NOTE: This does NOT freeze other components!
Component freezing is handled separately by freeze_components() based on
training mode (--text, --video, --image, --voice flags).
This allows PARALLEL FINE-TUNING:
- LoRA adapters on LLM for efficient adaptation
- Full weight training on active components (vision, audio, etc.)
"""
if self .lora_applied :
logger .warning ("LoRA already applied")
return
if not self .config .use_lora :
logger .info ("LoRA disabled in config")
return
lora_config =LoRAConfig (
r =self .config .lora_r ,
lora_alpha =self .config .lora_alpha ,
lora_dropout =self .config .lora_dropout ,
target_modules =list (self .config .lora_target_modules ),
enable_lora =True ,
)
logger .info ("Applying LoRA to LLM Core...")
self .llm =apply_lora_to_model (self .llm ,lora_config )
if self .cross_attention_layers is not None :
logger .info ("Applying LoRA to cross-attention layers...")
cross_attn_lora_config =LoRAConfig (
r =lora_config .r ,
lora_alpha =lora_config .lora_alpha ,
lora_dropout =lora_config .lora_dropout ,
target_modules =['q_proj','k_proj','v_proj','o_proj'],
enable_lora =True ,
)
for i ,layer in enumerate (self .cross_attention_layers ):
self .cross_attention_layers [i ]=apply_lora_to_model (layer ,cross_attn_lora_config )
self .lora_applied =True
self ._print_stats ()
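# Usage sketch (assumes config.use_lora=True with LoRA hyperparameters already set in config):
#   model.apply_lora()                    # wrap LLM / cross-attention linears in LoRALinear
#   model.freeze_components(['vision'])   # then freeze whatever modalities are inactive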
def get_trainable_params (self ):
"""
Get trainable parameters, respecting LoRA settings and component freezing.
If train_lora_only=True and LoRA is applied:
- Freezes all non-LoRA params
- Returns only LoRA params
Otherwise:
- Returns all params with requires_grad=True
- This includes both LoRA params AND unfrozen component weights
- Allows parallel fine-tuning: LoRA + full weights on active components
"""
if self .config .train_lora_only and self .lora_applied :
freeze_non_lora_params (self )
return get_lora_parameters (self )
return [p for p in self .parameters ()if p .requires_grad ]
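# Typical optimizer wiring (sketch; AdamW and the learning rate are arbitrary choices):
#   optimizer = torch.optim.AdamW(model.get_trainable_params(), lr=2e-4, weight_decay=0.01)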
def _print_stats (self ):
total =sum (p .numel ()for p in self .parameters ())
trainable =sum (p .numel ()for p in self .parameters ()if p .requires_grad )
logger .info ("Model Statistics:")
logger .info (f" Total parameters: {total /1e6 :.1f}M")
logger .info (f" Trainable parameters: {trainable /1e6 :.1f}M")
if self .lora_applied :
lora_params =sum (p .numel ()for n ,p in self .named_parameters ()if 'lora_'in n )
logger .info (f" LoRA parameters: {lora_params /1e6 :.2f}M")
def encode_image (self ,pixel_values :torch .Tensor )->torch .Tensor :
encoder_device =self .get_encoder_device ()
pixel_values =pixel_values .to (encoder_device )
vision_features =self .vision_encoder (pixel_values )
projected =self .projector (vision_features )
llm_device =self .get_llm_device ()
return projected .to (llm_device )
def encode_video (self ,video_frames :torch .Tensor )->torch .Tensor :
encoder_device =self .get_encoder_device ()
video_frames =video_frames .to (encoder_device )
video_features =self .video_encoder (video_frames )
projected =self .projector (video_features )
llm_device =self .get_llm_device ()
return projected .to (llm_device )
def encode_audio (self ,audio_features :torch .Tensor )->torch .Tensor :
encoder_device =self .get_encoder_device ()
audio_features =audio_features .to (encoder_device )
audio_embeds =self .audio_encoder (audio_features )
projected =self .audio_projector (audio_embeds )
llm_device =self .get_llm_device ()
return projected .to (llm_device )
def get_text_embeddings (self ,input_ids :torch .Tensor ,attention_mask :torch .Tensor =None )->torch .Tensor :
llm_device =self .get_llm_device ()
input_ids =input_ids .to (llm_device )
embeddings =self .llm .model .embed_tokens (input_ids )
return embeddings
def _apply_cross_attention (
self ,
text_embeds :torch .Tensor ,
image_embeds :torch .Tensor =None ,
video_embeds :torch .Tensor =None ,
audio_embeds :torch .Tensor =None ,
)->torch .Tensor :
if self .cross_attention_layers is None :
return text_embeds
for fusion_layer in self .cross_attention_layers :
text_embeds ,_ =fusion_layer (
text_hidden =text_embeds ,
image_hidden =image_embeds ,
video_hidden =video_embeds ,
audio_hidden =audio_embeds ,
use_cache =False ,
)
return text_embeds
def forward (
self ,
input_ids :torch .Tensor ,
attention_mask :torch .Tensor =None ,
pixel_values :torch .Tensor =None ,
video_frames :torch .Tensor =None ,
audio_features :torch .Tensor =None ,
labels :torch .Tensor =None ,
):
"""Forward pass - FP16 native."""
batch_size =input_ids .shape [0 ]
llm_device =self .get_llm_device ()
input_ids_llm =input_ids .to (llm_device )
text_embeds =self .llm .model .embed_tokens (input_ids_llm )
text_embeds =safe_clamp_tensor (text_embeds )
device =text_embeds .device
if attention_mask is not None :
attention_mask =attention_mask .to (device )
if labels is not None :
labels =labels .to (device )
image_embeds_for_cross =None
video_embeds_for_cross =None
audio_embeds_for_cross =None
def has_content (tensor ):
if tensor is None :
return False
if not isinstance (tensor ,torch .Tensor ):
return False
try :
if tensor .numel ()==0 :
return False
return bool (tensor .any ())
except Exception :
return False
if has_content (pixel_values ):
try :
image_embeds =self .encode_image (pixel_values )
image_embeds =safe_clamp_tensor (image_embeds )
image_embeds_for_cross =image_embeds
image_start =self .image_start .expand (batch_size ,-1 ,-1 )
image_end =self .image_end .expand (batch_size ,-1 ,-1 )
image_embeds =torch .cat ([image_start ,image_embeds ,image_end ],dim =1 )
text_embeds =torch .cat ([image_embeds ,text_embeds ],dim =1 )
text_embeds =safe_clamp_tensor (text_embeds )
if attention_mask is not None :
image_mask =torch .ones (batch_size ,image_embeds .shape [1 ],device =device )
attention_mask =torch .cat ([image_mask ,attention_mask ],dim =1 )
if labels is not None :
image_labels =torch .full ((batch_size ,image_embeds .shape [1 ]),-100 ,device =device ,dtype =labels .dtype )
labels =torch .cat ([image_labels ,labels ],dim =1 )
except Exception as e :
logger .debug (f"Image encoding skipped: {e }")
if has_content (video_frames ):
try :
video_embeds =self .encode_video (video_frames )
video_embeds =safe_clamp_tensor (video_embeds )
video_embeds_for_cross =video_embeds
video_start =self .video_start .expand (batch_size ,-1 ,-1 )
video_end =self .video_end .expand (batch_size ,-1 ,-1 )
video_embeds =torch .cat ([video_start ,video_embeds ,video_end ],dim =1 )
text_embeds =torch .cat ([video_embeds ,text_embeds ],dim =1 )
text_embeds =safe_clamp_tensor (text_embeds )
if attention_mask is not None :
video_mask =torch .ones (batch_size ,video_embeds .shape [1 ],device =device )
attention_mask =torch .cat ([video_mask ,attention_mask ],dim =1 )
if labels is not None :
video_labels =torch .full ((batch_size ,video_embeds .shape [1 ]),-100 ,device =device ,dtype =labels .dtype )
labels =torch .cat ([video_labels ,labels ],dim =1 )
except Exception as e :
logger .debug (f"Video encoding skipped: {e }")
if has_content (audio_features ):
try :
audio_embeds =self .encode_audio (audio_features )
audio_embeds =safe_clamp_tensor (audio_embeds )
audio_embeds_for_cross =audio_embeds
audio_start =self .audio_start .expand (batch_size ,-1 ,-1 )
audio_end =self .audio_end .expand (batch_size ,-1 ,-1 )
audio_embeds =torch .cat ([audio_start ,audio_embeds ,audio_end ],dim =1 )
text_embeds =torch .cat ([audio_embeds ,text_embeds ],dim =1 )
text_embeds =safe_clamp_tensor (text_embeds )
if attention_mask is not None :
audio_mask =torch .ones (batch_size ,audio_embeds .shape [1 ],device =device )
attention_mask =torch .cat ([audio_mask ,attention_mask ],dim =1 )
if labels is not None :
audio_labels =torch .full ((batch_size ,audio_embeds .shape [1 ]),-100 ,device =device ,dtype =labels .dtype )
labels =torch .cat ([audio_labels ,labels ],dim =1 )
except Exception as e :
logger .debug (f"Audio encoding skipped: {e }")
if self .cross_attention_layers is not None :
try :
text_embeds =self ._apply_cross_attention (
text_embeds ,
image_embeds =image_embeds_for_cross ,
video_embeds =video_embeds_for_cross ,
audio_embeds =audio_embeds_for_cross ,
)
text_embeds =safe_clamp_tensor (text_embeds )
except Exception as e :
logger .debug (f"Cross-attention skipped: {e }")
text_embeds =safe_clamp_tensor (text_embeds )
outputs =self .llm (inputs_embeds =text_embeds ,attention_mask =attention_mask ,labels =labels )
return MultimodalModelOutput (
loss =outputs .loss if hasattr (outputs ,'loss')else None ,
logits =outputs .logits if hasattr (outputs ,'logits')else None ,
aux_loss =outputs .aux_loss if hasattr (outputs ,'aux_loss')else None ,
)
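# Illustrative call mixing text and images (shapes are assumptions, e.g. 224x224 RGB;
# passing labels=input_ids yields a standard next-token loss on the text span):
#   out = model(
#       input_ids=input_ids,                          # [B, T]
#       attention_mask=attention_mask,                # [B, T]
#       pixel_values=torch.randn(B, 3, 224, 224),     # optional image batch
#       labels=input_ids,
#   )
#   loss, logits = out.loss, out.logits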
@torch .no_grad ()
def generate_image (self ,input_ids :torch .Tensor ,attention_mask :torch .Tensor =None ):
"""Generate image from text."""
if self .generator is None :
raise ValueError ("Image generator not enabled")
context =self .get_text_embeddings (input_ids ,attention_mask )
images =self .generator .generate (context )
return images
@torch .no_grad ()
def generate_video (self ,input_ids :torch .Tensor ,attention_mask :torch .Tensor =None ,
first_frame :torch .Tensor =None ,num_frames :int =None ):
"""Generate video from text (T2V) or from image (I2V)."""
if self .video_generator is None :
raise ValueError ("Video generator not enabled")
context =self .get_text_embeddings (input_ids ,attention_mask )
context =context .mean (dim =1 )
if first_frame is not None :
video =self .video_generator .generate_i2v (first_frame ,context ,num_frames )
else :
video =self .video_generator .generate_t2v (context ,num_frames )
return video
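# Sketch of the two modes (the 256x256 first-frame shape is an assumption):
#   t2v = model.generate_video(input_ids, attention_mask)                        # text-to-video
#   i2v = model.generate_video(input_ids, attention_mask,
#                              first_frame=torch.randn(1, 3, 256, 256))          # image-to-video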
@torch .no_grad ()
def generate_speech (self ,input_ids :torch .Tensor ,attention_mask :torch .Tensor =None ):
"""Generate speech (mel-spectrogram) from text (TTS)."""
text_embeds =self .get_text_embeddings (input_ids ,attention_mask )
mel ,durations ,_ ,_ =self .audio_decoder (text_embeds )
return mel ,durations
@torch .no_grad ()
def speak (
self ,
input_ids :torch .Tensor ,
attention_mask :torch .Tensor =None ,
speaker_embedding :torch .Tensor =None ,
return_mel :bool =False ,
)->torch .Tensor :
"""
Generate playable audio waveform from text (Speech-to-Speech TTS).
This is the main method for making the model talk. It converts text
directly to audio waveform without needing an external vocoder.
Args:
input_ids: [B, T] tokenized text input
attention_mask: [B, T] attention mask
speaker_embedding: [B, D] optional speaker embedding for voice cloning
return_mel: If True, also return intermediate mel spectrogram
Returns:
waveform: [B, T_audio] raw audio waveform in [-1, 1] range at 16kHz
Can be played directly or saved as WAV file
mel (optional): [B, 80, T_mel] mel spectrogram if return_mel=True
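Example (illustrative sketch; the tokenizer and the torchaudio call are
assumptions, not part of this class):
>>> ids = tokenizer("Hello, I am Xoron.", return_tensors="pt").input_ids
>>> wav = model.speak(ids)                       # [1, T_audio] in [-1, 1]
>>> torchaudio.save("hello.wav", wav.cpu(), 16000)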
"""
text_embeds =self .get_text_embeddings (input_ids ,attention_mask )
mel ,durations ,_ ,_ =self .audio_decoder (
text_embeds ,
speaker_embedding =speaker_embedding ,
)
mel_features =mel .transpose (1 ,2 )
if not hasattr (self ,'_mel_to_hidden'):
self ._mel_to_hidden =nn .Linear (80 ,self .config .hidden_size ).to (mel .device )
audio_features =self ._mel_to_hidden (mel_features )
waveform =self .waveform_decoder (audio_features )
if return_mel :
return waveform ,mel
return waveform
@torch .no_grad ()
def listen (self ,audio_waveform :torch .Tensor )->torch .Tensor :
"""
Transcribe audio to text embeddings (Speech-to-Speech ASR).
This is the listening component - converts speech to embeddings
that can be fed to the LLM for understanding.
Args:
audio_waveform: [B, T_audio] raw audio waveform
Returns:
audio_embeds: [B, T, hidden_size] encoded audio features
"""
return self .encode_audio (audio_waveform )
@torch .no_grad ()
def listen_and_respond (
self ,
audio_waveform :torch .Tensor ,
tokenizer =None ,
max_new_tokens :int =512 ,
speaker_embedding :torch .Tensor =None ,
temperature :float =0.7 ,
top_p :float =0.9 ,
tool_executor =None ,
available_tools :list =None ,
system_prompt :str =None ,
max_tool_calls :int =5 ,
) -> Dict [str ,Any ]:
"""
Agentic Speech-to-Speech: Listen, think, use tools, speak back.
This is the full agentic pipeline for live voice conversations.
The model can detect when the user is asking for actions (e.g.
"write me a Python script") and execute tools mid-generation.
Pipeline:
1. Encode input audio → audio embeddings (ASR)
2. Build context (system prompt with tools + audio embeddings)
3. Generate tokens, watching for <|tool_call|> sequences
4. When tool call detected: parse, execute, inject result, resume
5. Synthesize final spoken response from non-tool text
Args:
audio_waveform: [B, T_audio] input audio waveform
tokenizer: Tokenizer for decoding tokens to text (required for tools)
max_new_tokens: Maximum total tokens to generate
speaker_embedding: [B, D] optional speaker embedding for voice cloning
temperature: Sampling temperature
top_p: Nucleus sampling probability
tool_executor: Callable(tool_name, args_dict) -> str result.
If None, tool calls are detected but not executed.
available_tools: List of tool definition dicts for system prompt.
system_prompt: Optional system prompt override.
max_tool_calls: Maximum number of tool calls per response (safety limit).
Returns:
Dict with:
'waveform': [B, T_response] audio waveform tensor (in-memory, no file I/O)
'text': str full response text (excluding tool call markup)
'token_ids': [B, T_tokens] all generated token IDs
'mel': [B, 80, T_mel] intermediate mel spectrogram
'tool_calls': List[Dict] executed tool calls and their results
'speaking_text': str clean text that was spoken (no tool markup)
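Example tool_executor (hedged sketch; the "get_weather" tool and its JSON
payload are hypothetical, only the (name, args) -> str contract is real):
>>> def my_tool_executor(tool_name, args):
...     if tool_name == "get_weather":
...         return '{"temp_c": 21, "sky": "clear"}'
...     return f"[error]: unknown tool {tool_name}"
>>> out = model.listen_and_respond(waveform, tokenizer=tok, tool_executor=my_tool_executor)
>>> out["speaking_text"], out["tool_calls"]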
"""
import json as _json
device = audio_waveform .device
batch_size = audio_waveform .shape [0 ]
llm_device = self .get_llm_device ()
# ── 1. Listen: encode input audio ──
audio_embeds = self .encode_audio (audio_waveform )
# Wrap with start/end markers
audio_start = self .audio_start .expand (batch_size , -1 , -1 ).to (llm_device )
audio_end = self .audio_end .expand (batch_size , -1 , -1 ).to (llm_device )
audio_embeds = audio_embeds .to (llm_device )
# ── 2. Build context with system prompt + tools ──
context_parts = []
if tokenizer is not None and (system_prompt or tool_executor):
sys_text = system_prompt or "You are Xoron, an intelligent voice assistant. You can use tools to help the user."
if tool_executor and hasattr (tool_executor , 'get_tool_prompt' ):
sys_text = sys_text + "\n\n" + tool_executor .get_tool_prompt ()
elif available_tools :
from utils .tool_executor import format_tools_for_prompt
sys_text = sys_text + "\n\n" + format_tools_for_prompt (available_tools )
# Encode system prompt and prepend
sys_str = "<|system|>" + sys_text + "<|/system|>"
sys_token_ids = tokenizer .encode (sys_str , return_tensors ="pt" ).to (llm_device )
sys_embeds = self .llm .model .embed_tokens (sys_token_ids )
context_parts.append(sys_embeds if sys_embeds.dim() == 3 else sys_embeds.unsqueeze(0))  # keep [1, T, D] so it concatenates with the 3-D audio embeddings
# Audio context
context_parts .extend ([audio_start , audio_embeds , audio_end ])
# Assistant generation prompt
if tokenizer is not None :
asst_str = "<|assistant|>"
asst_ids = tokenizer .encode (asst_str , return_tensors ="pt" ).to (llm_device )
asst_embeds = self .llm .model .embed_tokens (asst_ids )
context_parts.append(asst_embeds if asst_embeds.dim() == 3 else asst_embeds.unsqueeze(0))  # keep [1, T, D] for concatenation along dim=1
input_embeds = torch .cat (context_parts , dim =1 )
# ── 3. Agentic generation loop with tool call detection ──
tool_call_start_token = "<|tool_call|>"
tool_call_end_token = "<|/tool_call|>"
fn_name_start = "<|function_name|>"
fn_name_end = "<|/function_name|>"
fn_args_start = "<|function_args|>"
fn_args_end = "<|/function_args|>"
tool_result_start = "<|tool_result|>"
tool_result_end = "<|/tool_result|>"
eos_token = "<|eos|>"
all_generated_ids = []
tool_calls_made = []
num_tool_calls = 0
generated_text = ""
total_tokens = 0
# Use standard generation if no tool executor
if tool_executor is None or tokenizer is None :
gen_kwargs = {
'inputs_embeds': input_embeds ,
'max_new_tokens': max_new_tokens ,
'do_sample': True ,
'temperature': temperature ,
'top_p': top_p ,
'use_cache': True ,
}
generated_ids = self .llm .generate (**gen_kwargs )
all_generated_ids = [generated_ids ]
if tokenizer is not None :
generated_text = tokenizer .batch_decode (generated_ids , skip_special_tokens =True )[0 ]
else :
# Token-by-token generation with tool call detection
current_embeds = input_embeds
past_key_values = None
in_tool_call = False
tool_call_buffer = ""
while total_tokens < max_new_tokens :
outputs = self .llm (
inputs_embeds =current_embeds ,
past_key_values =past_key_values ,
use_cache =True ,
)
past_key_values = outputs .past_key_values
logits = outputs .logits [:, -1 :, :]
# Sample next token
if temperature > 0 :
logits = logits / temperature
if top_p < 1.0 :
sorted_logits , sorted_indices = torch .sort (logits , descending =True , dim =-1 )
cumulative_probs = torch .cumsum (F .softmax (sorted_logits , dim =-1 ), dim =-1 )
sorted_mask = cumulative_probs - F .softmax (sorted_logits , dim =-1 ) >= top_p
sorted_logits [sorted_mask ] = float ('-inf' )
logits .scatter_ (-1 , sorted_indices , sorted_logits )
probs = F .softmax (logits , dim =-1 )
next_token = torch .multinomial (probs .squeeze (1 ), num_samples =1 )
else :
next_token = logits .argmax (dim =-1 )
total_tokens += 1
all_generated_ids .append (next_token )
# Decode the token
token_text = tokenizer .decode (next_token [0 ], skip_special_tokens =False )
generated_text = generated_text + token_text
# Check for EOS
if eos_token in token_text or next_token .item () == tokenizer .eos_token_id :
break
# ── Tool call detection ──
if tool_call_start_token in generated_text and not in_tool_call:
in_tool_call = True
# Buffer everything from the tool_call marker onward; generated_text already
# contains the current token_text, so it is not appended again below
tc_start_idx = generated_text.rfind(tool_call_start_token)
tool_call_buffer = generated_text[tc_start_idx:]
elif in_tool_call:
tool_call_buffer += token_text
if in_tool_call:
# Check if we have a complete tool call
if tool_call_end_token in tool_call_buffer :
in_tool_call = False
num_tool_calls += 1
# Parse the tool call
tool_name = ""
tool_args = {}
try:
# Extract function name; find() returns -1 when a marker is missing,
# so check before offsetting by the marker length
name_idx = tool_call_buffer.find(fn_name_start)
name_end = tool_call_buffer.find(fn_name_end)
if name_idx != -1 and name_end != -1:
tool_name = tool_call_buffer[name_idx + len(fn_name_start):name_end].strip()
# Extract arguments
args_idx = tool_call_buffer.find(fn_args_start)
args_end = tool_call_buffer.find(fn_args_end)
if args_idx != -1 and args_end != -1:
args_str = tool_call_buffer[args_idx + len(fn_args_start):args_end].strip()
try:
tool_args = _json.loads(args_str)
except Exception:
tool_args = {"raw": args_str}
except Exception :
pass
# Execute the tool
tool_result = "[error]: Failed to parse tool call"
if tool_name :
tool_result = tool_executor (tool_name , tool_args )
tool_calls_made .append ({
"name": tool_name ,
"arguments": tool_args ,
"result": tool_result ,
})
# Inject tool result back into generation context
result_str = tool_result_start + tool_result + tool_result_end
result_ids = tokenizer .encode (result_str , return_tensors ="pt" ).to (llm_device )
result_embeds = self .llm .model .embed_tokens (result_ids )
current_embeds = result_embeds
past_key_values = None # Reset KV cache to include result
all_generated_ids .append (result_ids .squeeze (0 ))
generated_text = generated_text + result_str
tool_call_buffer = ""
if num_tool_calls >= max_tool_calls :
break
continue
# Prepare next input
next_embeds = self .llm .model .embed_tokens (next_token )
current_embeds = next_embeds
# Combine all generated IDs
if all_generated_ids :
flat_ids = []
for t in all_generated_ids :
if t .dim () == 0 :
flat_ids .append (t .unsqueeze (0 ))
elif t .dim () == 1 :
flat_ids .append (t )
else :
flat_ids .append (t .view (-1 ))
generated_ids = torch .cat (flat_ids , dim =0 ).unsqueeze (0 )
else :
generated_ids = torch .tensor ([[]], dtype =torch .long , device =llm_device )
# ── 4. Extract speaking text (strip tool call/result markup) ──
speaking_text = generated_text
# Remove tool call blocks
while tool_call_start_token in speaking_text :
tc_s = speaking_text .find (tool_call_start_token )
tc_e = speaking_text .find (tool_call_end_token )
if tc_e > tc_s :
speaking_text = speaking_text [:tc_s ] + speaking_text [tc_e + len (tool_call_end_token ):]
else :
break
# Remove tool result blocks
while tool_result_start in speaking_text :
tr_s = speaking_text .find (tool_result_start )
tr_e = speaking_text .find (tool_result_end )
if tr_e > tr_s :
speaking_text = speaking_text [:tr_s ] + speaking_text [tr_e + len (tool_result_end ):]
else :
break
speaking_text = speaking_text .strip ()
# ── 5. Speak: encode → mel → stream_decode → waveform ──
response_embeds = self .llm .model .embed_tokens (generated_ids .to (llm_device ))
mel , durations , _ , _ = self .audio_decoder (
response_embeds ,
speaker_embedding =speaker_embedding ,
)
mel_features = mel .transpose (1 , 2 )
if not hasattr (self , '_mel_to_hidden' ):
self ._mel_to_hidden = nn .Linear (80 , self .config .hidden_size ).to (mel .device )
audio_features = self ._mel_to_hidden (mel_features )
waveform = self .waveform_decoder .stream_decode (audio_features )
return {
'waveform': waveform ,
'text': generated_text ,
'speaking_text': speaking_text ,
'token_ids': generated_ids ,
'mel': mel ,
'tool_calls': tool_calls_made ,
}
def merge_lora_weights (self ):
"""Merge LoRA weights into main weights for inference."""
if not self .lora_applied :
return
for module in self .modules ():
if isinstance (module ,LoRALinear ):
module .merge_lora_weights ()
logger .info ("LoRA weights merged into base model")
def unmerge_lora_weights (self ):
"""Unmerge LoRA weights for continued training."""
if not self .lora_applied :
return
for module in self .modules ():
if isinstance (module ,LoRALinear ):
module .unmerge_lora_weights ()
logger .info ("LoRA weights unmerged")
def save_pretrained (
self ,
path :str ,
optimizer =None ,
scheduler =None ,
global_step :int =0 ,
epoch :int =0 ,
best_loss :float =float ('inf'),
sharded :bool =False ,
max_shard_size :int =2 *1024 *1024 *1024 ,
save_separately :bool =True ,
):
"""
Save model and optionally training state for resuming.
Args:
path: Directory to save the model
optimizer: Optional optimizer to save state
scheduler: Optional scheduler to save state
global_step: Current training step
epoch: Current epoch
best_loss: Best loss achieved so far
sharded: If True, save model in multiple .safetensors files
max_shard_size: Maximum size per shard in bytes (default 2GB)
save_separately: If True, save each component as separate .safetensors files (default)
This avoids safetensors issues with shared storage in LSTM weights
"""
os .makedirs (path ,exist_ok =True )
if save_separately :
self ._save_components_safe (path )
elif sharded :
self ._save_sharded (path ,max_shard_size )
else :
self ._save_single_file_safe (path )
config_dict =self .config .to_dict ()
config_dict ['has_audio_encoder']=True
config_dict ['has_audio_decoder']=True
config_dict ['has_waveform_decoder']=hasattr (self ,'waveform_decoder')and self .waveform_decoder is not None
config_dict ['has_vision_encoder']=hasattr (self ,'vision_encoder')and self .vision_encoder is not None
config_dict ['has_video_encoder']=hasattr (self ,'video_encoder')and self .video_encoder is not None
config_dict ['has_generator']=hasattr (self ,'generator')and self .generator is not None
config_dict ['has_video_generator']=hasattr (self ,'video_generator')and self .video_generator is not None
config_dict ['has_cross_attention']=hasattr (self ,'cross_attention_layers')and self .cross_attention_layers is not None
config_dict ['lora_applied']=self .lora_applied
config_dict ['architecture_version']=2
config_dict ['auto_map']={
'AutoConfig':'configuration_xoron.XoronConfig',
'AutoModel':'modeling_xoron.XoronModel',
'AutoModelForCausalLM':'modeling_xoron.XoronForCausalLM',
}
with open (os .path .join (path ,"config.json"),"w")as f :
json .dump (config_dict ,f ,indent =2 )
self ._copy_huggingface_files (path )
if optimizer is not None or scheduler is not None :
training_state ={
'global_step':global_step ,
'epoch':epoch ,
'best_loss':best_loss ,
}
if optimizer is not None :
training_state ['optimizer_state_dict']=optimizer .state_dict ()
if scheduler is not None :
training_state ['scheduler_state_dict']=scheduler .state_dict ()
torch .save (training_state ,os .path .join (path ,"training_state.pt"))
logger .info (f"Training state saved (step {global_step }, epoch {epoch })")
logger .info (f"Model saved to {path }")
def _copy_huggingface_files (self ,path :str ):
"""
Build and copy HuggingFace custom code files for trust_remote_code support.
This DYNAMICALLY BUILDS a self-contained modeling_xoron.py by combining
all model components, so users can load from HuggingFace Hub with:
model = AutoModel.from_pretrained("repo/model", trust_remote_code=True)
WITHOUT needing to install the full Xoron-Dev package.
Args:
path: Directory to save the files
"""
import shutil
current_dir =os .path .dirname (os .path .abspath (__file__ ))
project_root =os .path .dirname (current_dir )
config_src =os .path .join (project_root ,'configuration_xoron.py')
config_dst =os .path .join (path ,'configuration_xoron.py')
if os .path .exists (config_src ):
shutil .copy2 (config_src ,config_dst )
logger .info ("Copied configuration_xoron.py")
modeling_dst =os .path .join (path ,'modeling_xoron.py')
self ._build_self_contained_modeling_file (project_root ,modeling_dst )
logger .info ("HuggingFace custom code files ready")
def _build_self_contained_modeling_file (self ,project_root :str ,output_path :str ):
"""
Build a self-contained modeling_xoron.py by combining all model components.
This creates a single file with ALL model code embedded, removing internal
imports so it works standalone on HuggingFace without the full package.
"""
import re
component_files =[
"models/components/lora.py",
"models/components/attention.py",
"models/components/projectors.py",
"models/components/moe.py",
"models/encoders/vision.py",
"models/encoders/video.py",
"models/encoders/audio.py",
"models/generators/image.py",
"models/generators/video.py",
"models/llm/moe_llama.py",
"models/xoron.py",
]
internal_import_patterns =[
r"^from config import.*$",
r"^from config\..*import.*$",
r"^from models\..*import.*$",
r"^from models import.*$",
]
def is_internal_import (line ):
line =line .strip ()
for pattern in internal_import_patterns :
if re .match (pattern ,line ):
return True
return False
def is_module_level_import (line ):
"""Check if this is a module-level import (no indentation)."""
stripped =line .strip ()
if line and not line [0 ].isspace ():
return (stripped .startswith ("import ")or stripped .startswith ("from "))
return False
def extract_code_body (content ):
"""Extract code body, removing module docstring and module-level imports only."""
lines =content .split ('\n')
code_lines =[]
i =0
in_multiline_import =False
while i <len (lines )and not lines [i ].strip ():
i +=1
if i <len (lines ):
stripped =lines [i ].strip ()
if stripped .startswith ('"""')or stripped .startswith ("'''"):
docstring_char =stripped [:3 ]
if stripped .count (docstring_char )>=2 :
i +=1
else :
i +=1
while i <len (lines ):
if docstring_char in lines [i ]:
i +=1
break
i +=1
for line in lines [i :]:
stripped =line .strip ()
if not code_lines and not stripped :
continue
if in_multiline_import :
if ')'in stripped :
in_multiline_import =False
continue
if is_module_level_import (line ):
if '('in stripped and ')'not in stripped :
in_multiline_import =True
continue
if stripped .startswith ("logger = logging.getLogger")and not line [0 ].isspace ():
continue
code_lines .append (line )
while code_lines and not code_lines [-1 ].strip ():
code_lines .pop ()
return '\n'.join (code_lines )
header ='''"""
Xoron Model for HuggingFace Transformers - Self-Contained Implementation.
AUTO-GENERATED FILE - Do not edit directly!
This module provides a complete, self-contained HuggingFace-compatible model class
for the Xoron multimodal model. All components are embedded directly in this file
to enable loading via AutoModel with trust_remote_code=True WITHOUT requiring
the full Xoron-Dev package to be installed.
Usage:
from transformers import AutoModel, AutoConfig
config = AutoConfig.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
model = AutoModel.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
"""
try:
from safetensors.torch import save_file, load_file
except ImportError:
save_file, load_file = None, None
try:
from transformers.models.llama.modeling_llama import (
LlamaAttention, LlamaDecoderLayer, LlamaRMSNorm, LlamaMLP,
LlamaRotaryEmbedding, apply_rotary_pos_emb, repeat_kv
)
except ImportError:
LlamaAttention = LlamaDecoderLayer = LlamaRMSNorm = LlamaMLP = None
LlamaRotaryEmbedding = apply_rotary_pos_emb = repeat_kv = None
try:
from .configuration_xoron import XoronConfig
except ImportError:
from configuration_xoron import XoronConfig
logger = logging.getLogger(__name__)
'''
all_code =[header ]
for filepath in component_files :
full_path =os .path .join (project_root ,filepath )
if not os .path .exists (full_path ):
logger .warning (f"Component not found: {filepath }")
continue
with open (full_path ,'r',encoding ='utf-8')as f :
content =f .read ()
code =extract_code_body (content )
if code .strip ():
section_name =filepath .replace ('/','.').replace ('.py','').upper ()
section_header = f"\n\n# {'=' * 78}\n# {section_name}\n# {'=' * 78}\n\n"
all_code .append (section_header +code )
hf_wrapper ='''
class XoronPreTrainedModel(PreTrainedModel):
"""Base class for Xoron models providing HuggingFace integration."""
config_class = XoronConfig
base_model_prefix = "xoron"
supports_gradient_checkpointing = True
_no_split_modules = ["XoronMultimodalModel"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
def _init_weights(self, module):
std = 0.02
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class XoronModel(XoronPreTrainedModel):
"""Xoron Multimodal Model for HuggingFace."""
def __init__(self, config: XoronConfig):
super().__init__(config)
self.config = config
self._internal_model = None
self._model_initialized = False
def _ensure_model_initialized(self):
"""Lazily initialize the internal model to avoid meta device conflicts."""
if not self._model_initialized:
self._internal_model = XoronMultimodalModel(self.config)
self._model_initialized = True
@property
def internal_model(self):
self._ensure_model_initialized()
return self._internal_model
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
"""
Load pretrained Xoron model from HuggingFace Hub or local path.
This override ensures proper initialization without meta device conflicts.
"""
kwargs.pop('device_map', None)
config = kwargs.pop('config', None)
if config is None:
config = XoronConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
model = cls(config)
model._internal_model = XoronMultimodalModel(config)
model._model_initialized = True
import os
from safetensors import safe_open
if os.path.isdir(pretrained_model_name_or_path):
model_path = pretrained_model_name_or_path
else:
from huggingface_hub import snapshot_download
model_path = snapshot_download(repo_id=pretrained_model_name_or_path)
components_json = os.path.join(model_path, "components.json")
if os.path.exists(components_json):
with open(components_json, 'r') as f:
manifest = json.load(f)
component_map = {
'llm': model._internal_model.llm,
'vision_encoder': model._internal_model.vision_encoder,
'video_encoder': model._internal_model.video_encoder,
'audio_encoder': model._internal_model.audio_encoder,
'audio_decoder': model._internal_model.audio_decoder,
'projector': model._internal_model.projector,
'audio_projector': model._internal_model.audio_projector,
}
if model._internal_model.cross_attention_layers is not None:
component_map['cross_attention'] = model._internal_model.cross_attention_layers
if model._internal_model.generator is not None:
component_map['generator'] = model._internal_model.generator
if model._internal_model.video_generator is not None:
component_map['video_generator'] = model._internal_model.video_generator
if hasattr(model._internal_model, 'waveform_decoder') and model._internal_model.waveform_decoder is not None:
component_map['waveform_decoder'] = model._internal_model.waveform_decoder
for comp_name in manifest.get('components', []):
if comp_name == 'modality_markers':
continue
comp_path = os.path.join(model_path, f"{comp_name}.safetensors")
if os.path.exists(comp_path) and comp_name in component_map:
component = component_map[comp_name]
if component is not None:
with safe_open(comp_path, framework="pt") as f:
state_dict = {k: f.get_tensor(k) for k in f.keys()}
if comp_name == 'llm':
embed_key = 'model.embed_tokens.weight'
lm_head_key = 'lm_head.weight'
if embed_key in state_dict:
saved_vocab_size = state_dict[embed_key].shape[0]
hidden_size = state_dict[embed_key].shape[1]
current_vocab_size = component.model.embed_tokens.weight.shape[0]
if saved_vocab_size != current_vocab_size:
logger.info(f"Resizing embeddings: {current_vocab_size} -> {saved_vocab_size}")
new_embed = nn.Embedding(saved_vocab_size, hidden_size)
new_embed.weight.data = state_dict[embed_key]
component.model.embed_tokens = new_embed
if lm_head_key in state_dict:
new_lm_head = nn.Linear(hidden_size, saved_vocab_size, bias=False)
new_lm_head.weight.data = state_dict[lm_head_key]
component.lm_head = new_lm_head
del state_dict[embed_key]
if lm_head_key in state_dict:
del state_dict[lm_head_key]
component.load_state_dict(state_dict, strict=False)
logger.info(f"Loaded {comp_name}")
markers_path = os.path.join(model_path, "modality_markers.safetensors")
if os.path.exists(markers_path):
with safe_open(markers_path, framework="pt") as f:
model._internal_model.image_start.data = f.get_tensor('image_start')
model._internal_model.image_end.data = f.get_tensor('image_end')
model._internal_model.video_start.data = f.get_tensor('video_start')
model._internal_model.video_end.data = f.get_tensor('video_end')
model._internal_model.audio_start.data = f.get_tensor('audio_start')
model._internal_model.audio_end.data = f.get_tensor('audio_end')
logger.info("Loaded modality markers")
logger.info(f"Xoron model loaded from {pretrained_model_name_or_path}")
return model
def forward(
self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
pixel_values: Optional[torch.Tensor] = None,
video_frames: Optional[torch.Tensor] = None,
audio_features: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple, CausalLMOutputWithPast]:
self._ensure_model_initialized()
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self._internal_model(
input_ids=input_ids,
attention_mask=attention_mask,
pixel_values=pixel_values,
video_frames=video_frames,
audio_features=audio_features,
labels=labels,
)
if return_dict:
return CausalLMOutputWithPast(
loss=outputs.get("loss"),
logits=outputs.get("logits"),
past_key_values=outputs.get("past_key_values"),
hidden_states=outputs.get("hidden_states"),
attentions=outputs.get("attentions"),
)
return (outputs.get("loss"), outputs.get("logits"))
def generate_image(self, prompt_embeds: torch.Tensor, **kwargs):
self._ensure_model_initialized()
return self._internal_model.generate_image(prompt_embeds, **kwargs)
def generate_video(self, prompt_embeds: torch.Tensor, **kwargs):
self._ensure_model_initialized()
return self._internal_model.generate_video(prompt_embeds, **kwargs)
def generate_speech(self, text_embeds: torch.Tensor, **kwargs):
self._ensure_model_initialized()
return self._internal_model.generate_speech(text_embeds, **kwargs)
class XoronForCausalLM(XoronModel):
"""Alias for XoronModel for compatibility."""
pass
XoronConfig.register_for_auto_class()
XoronModel.register_for_auto_class("AutoModel")
XoronForCausalLM.register_for_auto_class("AutoModelForCausalLM")
'''
all_code .append (hf_wrapper )
final_content ='\n'.join (all_code )
with open (output_path ,'w',encoding ='utf-8')as f :
f .write (final_content )
line_count =final_content .count ('\n')
logger .info (f"Built self-contained modeling_xoron.py ({line_count :,} lines)")
def _save_single_file_safe (self ,path :str ):
"""
Save model as single safetensors file with cloned tensors.
Cloning breaks shared storage that causes safetensors errors.
Args:
path: Directory to save the model
"""
from safetensors .torch import save_file
state_dict =self .state_dict ()
safe_state_dict ={}
for key ,tensor in state_dict .items ():
safe_state_dict [key ]=tensor .clone ().contiguous ()
save_file (safe_state_dict ,os .path .join (path ,"model.safetensors"))
size_mb =sum (t .numel ()*t .element_size ()for t in safe_state_dict .values ())/(1024 *1024 )
logger .info (f"Saved model.safetensors ({size_mb :.1f} MB)")
def _save_components_safe (self ,path :str ):
"""
Save model components as separate .safetensors files with cloned tensors.
This is the default and most robust saving method that:
1. Handles LSTM weight sharing issues in safetensors
2. Allows surgical component loading/updates
3. Better for debugging and inspection
Args:
path: Directory to save component files
"""
from safetensors .torch import save_file
os .makedirs (path ,exist_ok =True )
component_map ={
'llm':self .llm ,
'vision_encoder':self .vision_encoder ,
'video_encoder':self .video_encoder ,
'audio_encoder':self .audio_encoder ,
'audio_decoder':self .audio_decoder ,
'projector':self .projector ,
'audio_projector':self .audio_projector ,
}
if self .cross_attention_layers is not None :
component_map ['cross_attention']=self .cross_attention_layers
if self .generator is not None :
component_map ['generator']=self .generator
if self .video_generator is not None :
component_map ['video_generator']=self .video_generator
if hasattr (self ,'waveform_decoder')and self .waveform_decoder is not None :
component_map ['waveform_decoder']=self .waveform_decoder
saved_files =[]
total_size =0
for comp_name ,component in component_map .items ():
if component is None :
continue
comp_state =component .state_dict ()
if not comp_state :
continue
safe_comp_state ={}
for key ,tensor in comp_state .items ():
safe_comp_state [key ]=tensor .clone ().contiguous ()
comp_path =os .path .join (path ,f"{comp_name }.safetensors")
save_file (safe_comp_state ,comp_path )
size_mb =sum (t .numel ()*t .element_size ()for t in safe_comp_state .values ())/(1024 *1024 )
total_size +=size_mb
logger .info (f"Saved {comp_name }: {size_mb :.1f} MB")
saved_files .append (comp_name )
markers ={
'image_start':self .image_start .data .clone ().contiguous (),
'image_end':self .image_end .data .clone ().contiguous (),
'video_start':self .video_start .data .clone ().contiguous (),
'video_end':self .video_end .data .clone ().contiguous (),
'audio_start':self .audio_start .data .clone ().contiguous (),
'audio_end':self .audio_end .data .clone ().contiguous (),
}
save_file (markers ,os .path .join (path ,"modality_markers.safetensors"))
logger .info ("Saved modality_markers")
manifest ={
"components":saved_files +["modality_markers"],
"save_format":"components",
}
with open (os .path .join (path ,"components.json"),"w")as f :
json .dump (manifest ,f ,indent =2 )
weight_map ={}
total_bytes =0
for comp_name ,component in component_map .items ():
if component is None :
continue
comp_state =component .state_dict ()
if not comp_state :
continue
safetensor_file =f"{comp_name }.safetensors"
for key in comp_state .keys ():
full_key =f"{comp_name }.{key }"
weight_map [full_key ]=safetensor_file
total_bytes +=comp_state [key ].numel ()*comp_state [key ].element_size ()
marker_names =['image_start','image_end','video_start','video_end','audio_start','audio_end']
for marker_name in marker_names :
weight_map [marker_name ]="modality_markers.safetensors"
marker_tensor =getattr (self ,marker_name )
total_bytes +=marker_tensor .numel ()*marker_tensor .element_size ()
index ={
"metadata":{
"total_size":total_bytes ,
"format":"components",
},
"weight_map":weight_map ,
}
index_path =os .path .join (path ,"model.safetensors.index.json")
with open (index_path ,"w")as f :
json .dump (index ,f ,indent =2 )
logger .info ("Saved model.safetensors.index.json for HuggingFace compatibility")
logger .info (f"Total size: {total_size :.1f} MB across {len (saved_files )} components")
def _save_sharded (self ,path :str ,max_shard_size :int ):
"""
Save model weights in sharded .safetensors files.
Components are surgically split across shards.
Args:
path: Directory to save shards
max_shard_size: Maximum bytes per shard
"""
from safetensors .torch import save_file
state_dict =self .state_dict ()
component_groups ={
'llm':{},
'vision_encoder':{},
'video_encoder':{},
'audio_encoder':{},
'audio_decoder':{},
'waveform_decoder':{},
'generator':{},
'video_generator':{},
'projector':{},
'audio_projector':{},
'cross_attention_layers':{},
'other':{},
}
for key ,tensor in state_dict .items ():
placed =False
for comp_name in component_groups .keys ():
if comp_name !='other'and key .startswith (comp_name ):
component_groups [comp_name ][key ]=tensor
placed =True
break
if not placed :
component_groups ['other'][key ]=tensor
shards =[]
current_shard ={}
current_size =0
shard_index_map ={}
for comp_name ,comp_tensors in component_groups .items ():
for key ,tensor in comp_tensors .items ():
tensor_size =tensor .numel ()*tensor .element_size ()
if current_size +tensor_size >max_shard_size and current_shard :
shards .append (current_shard )
current_shard ={}
current_size =0
current_shard [key ]=tensor
current_size +=tensor_size
if current_shard :
shards .append (current_shard )
total_shards =len (shards )
weight_map ={}
for i ,shard in enumerate (shards ):
shard_name =f"model-{i +1 :05d}-of-{total_shards :05d}.safetensors"
shard_path =os .path .join (path ,shard_name )
shard_contiguous ={k :v .clone ().contiguous ()for k ,v in shard .items ()}
save_file (shard_contiguous ,shard_path )
for key in shard .keys ():
weight_map [key ]=shard_name
shard_size_mb =sum (t .numel ()*t .element_size ()for t in shard .values ())/(1024 *1024 )
logger .info (f"Saved shard {i +1 }/{total_shards }: {shard_name } ({shard_size_mb :.1f} MB)")
index ={
"metadata":{
"total_size":sum (t .numel ()*t .element_size ()for t in state_dict .values ()),
"total_shards":total_shards ,
},
"weight_map":weight_map ,
}
index_path =os .path .join (path ,"model.safetensors.index.json")
with open (index_path ,"w")as f :
json .dump (index ,f ,indent =2 )
logger .info ("Saved index: model.safetensors.index.json")
def save_components_separately (self ,path :str ):
"""
Save model components as separate .safetensors files.
Useful for surgical component updates and debugging.
NOTE: This method now clones tensors to handle LSTM shared storage issues.
Args:
path: Directory to save component files
"""
from safetensors .torch import save_file
os .makedirs (path ,exist_ok =True )
component_map ={
'llm':self .llm ,
'vision_encoder':self .vision_encoder ,
'video_encoder':self .video_encoder ,
'audio_encoder':self .audio_encoder ,
'audio_decoder':self .audio_decoder ,
'projector':self .projector ,
'audio_projector':self .audio_projector ,
}
if self .cross_attention_layers is not None :
component_map ['cross_attention']=self .cross_attention_layers
if self .generator is not None :
component_map ['generator']=self .generator
if self .video_generator is not None :
component_map ['video_generator']=self .video_generator
if hasattr (self ,'waveform_decoder')and self .waveform_decoder is not None :
component_map ['waveform_decoder']=self .waveform_decoder
saved_files =[]
for comp_name ,component in component_map .items ():
if component is None :
continue
comp_state =component .state_dict ()
if not comp_state :
continue
comp_state ={k :v .clone ().contiguous ()for k ,v in comp_state .items ()}
comp_path =os .path .join (path ,f"{comp_name }.safetensors")
save_file (comp_state ,comp_path )
size_mb =sum (t .numel ()*t .element_size ()for t in comp_state .values ())/(1024 *1024 )
logger .info (f"Saved {comp_name }: {size_mb :.1f} MB")
saved_files .append (comp_name )
markers ={
'image_start':self .image_start .data .clone ().contiguous (),
'image_end':self .image_end .data .clone ().contiguous (),
'video_start':self .video_start .data .clone ().contiguous (),
'video_end':self .video_end .data .clone ().contiguous (),
'audio_start':self .audio_start .data .clone ().contiguous (),
'audio_end':self .audio_end .data .clone ().contiguous (),
}
save_file (markers ,os .path .join (path ,"modality_markers.safetensors"))
logger .info ("Saved modality_markers")
manifest ={
"components":saved_files +["modality_markers"],
"config":self .config .to_dict (),
"lora_applied":self .lora_applied ,
}
with open (os .path .join (path ,"components.json"),"w")as f :
json .dump (manifest ,f ,indent =2 )
weight_map ={}
total_bytes =0
for comp_name ,component in component_map .items ():
if component is None :
continue
comp_state =component .state_dict ()
if not comp_state :
continue
safetensor_file =f"{comp_name }.safetensors"
for key in comp_state .keys ():
full_key =f"{comp_name }.{key }"
weight_map [full_key ]=safetensor_file
total_bytes +=comp_state [key ].numel ()*comp_state [key ].element_size ()
marker_names =['image_start','image_end','video_start','video_end','audio_start','audio_end']
for marker_name in marker_names :
weight_map [marker_name ]="modality_markers.safetensors"
marker_tensor =getattr (self ,marker_name )
total_bytes +=marker_tensor .numel ()*marker_tensor .element_size ()
index ={
"metadata":{
"total_size":total_bytes ,
"format":"components",
},
"weight_map":weight_map ,
}
index_path =os .path .join (path ,"model.safetensors.index.json")
with open (index_path ,"w")as f :
json .dump (index ,f ,indent =2 )
logger .info ("Saved model.safetensors.index.json for HuggingFace compatibility")
logger .info (f"Components saved to {path }")
@classmethod
def from_pretrained (
cls ,
path :str ,
device :str =None ,
device_map :Dict [str ,str ]=None ,
apply_lora :bool =True ,
strict :bool =False ,
)->'XoronMultimodalModel':
"""
Load a pretrained Xoron model from a checkpoint or final model directory.
Args:
path: Path to the saved model directory
device: Device to load the model to (if not using device_map)
device_map: Device map for model parallelism
apply_lora: Whether to apply LoRA after loading
strict: If False, allows loading weights even if architecture changed
Returns:
Loaded XoronMultimodalModel instance
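Example (sketch; the checkpoint path and the two-GPU device_map are assumptions):
>>> model = XoronMultimodalModel.from_pretrained(
...     "checkpoints/step_1000",
...     device_map={'primary': 'cuda:0', 'training_gpus': ['cuda:0', 'cuda:1'], 'llm': 'cuda:0'},
... )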
"""
from safetensors import safe_open
logger .info (f"Loading model from {path }...")
config_path =os .path .join (path ,"config.json")
if not os .path .exists (config_path ):
raise FileNotFoundError (f"Config file not found at {config_path }")
with open (config_path ,'r')as f :
config_dict =json .load (f )
lora_was_applied =config_dict .pop ('lora_applied',False )
architecture_version =config_dict .pop ('architecture_version',1 )
has_waveform_decoder =config_dict .pop ('has_waveform_decoder',False )
has_vision_encoder =config_dict .pop ('has_vision_encoder',True )
has_video_encoder =config_dict .pop ('has_video_encoder',True )
has_generator =config_dict .pop ('has_generator',True )
has_video_generator =config_dict .pop ('has_video_generator',True )
has_cross_attention =config_dict .pop ('has_cross_attention',True )
config_dict .pop ('has_audio_encoder',None )
config_dict .pop ('has_audio_decoder',None )
logger .info (f"Saved model architecture (version {architecture_version }):")
logger .info (f" - Waveform Decoder: {'✅'if has_waveform_decoder else '❌ (will init randomly)'}")
logger .info (f" - Vision Encoder: {'✅'if has_vision_encoder else '❌'}")
logger .info (f" - Video Encoder: {'✅'if has_video_encoder else '❌'}")
logger .info (f" - Image Generator: {'✅'if has_generator else '❌'}")
logger .info (f" - Video Generator: {'✅'if has_video_generator else '❌'}")
logger .info (f" - Cross Attention: {'✅'if has_cross_attention else '❌'}")
logger .info (f" - LoRA Applied: {'✅'if lora_was_applied else '❌'}")
config =XoronConfig .from_dict (config_dict )
model =cls (config ,device_map =device_map )
if lora_was_applied:
logger .info ("Checkpoint has LoRA weights. Applying LoRA structure before loading...")
model .apply_lora ()
components_json =os .path .join (path ,"components.json")
model_path =os .path .join (path ,"model.safetensors")
if os .path .exists (components_json ):
logger .info ("Loading from component-based format...")
model ._load_components (path ,strict =strict )
model .lora_applied =False # Always allow fresh LoRA application (checkpoint has merged weights)
elif os .path .exists (model_path ):
logger .info ("Loading weights from safetensors...")
if strict :
load_model (model ,model_path )
else :
checkpoint_state_dict ={}
with safe_open (model_path ,framework ="pt",device ="cpu")as f :
for key in f .keys ():
checkpoint_state_dict [key ]=f .get_tensor (key )
model .load_state_dict (checkpoint_state_dict ,strict =False )
logger .info ("Loaded weights from checkpoint")
model .lora_applied =False # Always allow fresh LoRA application (checkpoint has merged weights)
else :
pytorch_path =os .path .join (path ,"pytorch_model.bin")
if os .path .exists (pytorch_path ):
logger .info ("Loading weights from pytorch_model.bin...")
checkpoint_state_dict =torch .load (pytorch_path ,map_location ='cpu')
model .load_state_dict (checkpoint_state_dict ,strict =False )
logger .info ("Loaded weights from checkpoint")
model .lora_applied =False # Always allow fresh LoRA application (checkpoint has merged weights)
else :
raise FileNotFoundError (f"No model weights found at {path }")
if apply_lora and config .use_lora and not model .lora_applied :
model .apply_lora ()
if device_map is not None :
model .apply_model_parallel (device_map )
elif device is not None :
model =model .to (device )
logger .info ("Model loaded successfully!")
model ._print_stats ()
return model
def _load_components (self ,path :str ,strict :bool =False ):
"""
Load model from component-based safetensors files.
Args:
path: Directory containing component files
strict: If True, require exact match; if False, allow partial loading
"""
from safetensors import safe_open
component_map ={
'llm':self .llm ,
'vision_encoder':self .vision_encoder ,
'video_encoder':self .video_encoder ,
'audio_encoder':self .audio_encoder ,
'audio_decoder':self .audio_decoder ,
'projector':self .projector ,
'audio_projector':self .audio_projector ,
}
if self .cross_attention_layers is not None :
component_map ['cross_attention']=self .cross_attention_layers
if self .generator is not None :
component_map ['generator']=self .generator
if self .video_generator is not None :
component_map ['video_generator']=self .video_generator
if hasattr (self ,'waveform_decoder')and self .waveform_decoder is not None :
component_map ['waveform_decoder']=self .waveform_decoder
for comp_name ,component in component_map .items ():
if component is None :
continue
comp_path =os .path .join (path ,f"{comp_name }.safetensors")
if not os .path .exists (comp_path ):
continue
try :
checkpoint_state ={}
with safe_open (comp_path ,framework ="pt",device ="cpu")as f :
for key in f .keys ():
checkpoint_state [key ]=f .get_tensor (key )
component .load_state_dict (checkpoint_state ,strict =strict )
size_mb =sum (t .numel ()*t .element_size ()for t in checkpoint_state .values ())/(1024 *1024 )
logger .info (f"Loaded {comp_name } ({size_mb :.1f} MB)")
except Exception as e :
logger .warning (f"Error loading {comp_name }: {e }")
markers_path =os .path .join (path ,"modality_markers.safetensors")
if os .path .exists (markers_path ):
try :
with safe_open (markers_path ,framework ="pt",device ="cpu")as f :
self .image_start .data =f .get_tensor ('image_start')
self .image_end .data =f .get_tensor ('image_end')
self .video_start .data =f .get_tensor ('video_start')
self .video_end .data =f .get_tensor ('video_end')
self .audio_start .data =f .get_tensor ('audio_start')
self .audio_end .data =f .get_tensor ('audio_end')
logger .info ("Loaded modality_markers")
except Exception as e :
logger .warning (f"Error loading modality_markers: {e }")
logger .info ("Components loaded successfully")
@staticmethod
def load_training_state (path :str )->Optional [Dict ]:
"""
Load training state from a checkpoint.
Args:
path: Path to the checkpoint directory
Returns:
Dictionary with training state or None if not found
"""
state_path =os .path .join (path ,"training_state.pt")
if os .path .exists (state_path ):
logger .info (f"Loading training state from {state_path }...")
return torch .load (state_path ,map_location ='cpu')
return None
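# Resume sketch (assumes save_pretrained() was called with an optimizer and scheduler):
#   state = XoronMultimodalModel.load_training_state("checkpoints/step_1000")
#   if state is not None:
#       optimizer.load_state_dict(state['optimizer_state_dict'])
#       scheduler.load_state_dict(state['scheduler_state_dict'])
#       start_step = state['global_step']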
def freeze_components (self ,components :List [str ],hard_freeze :bool =True ):
"""
Freeze specific components of the model.
IMPORTANT RULES:
1. LLM is NEVER frozen - it's trained from scratch and always needs full weight training
2. LoRA parameters are usually kept trainable, UNLESS hard_freeze=True
Args:
components: List of component group names to freeze.
Valid groups: 'vision', 'video', 'audio',
'cross_attention', 'image_generation', 'video_generation',
'modality_markers'
NOTE: 'llm' is NOT a valid group to freeze - will be ignored!
hard_freeze: If True, completely freezes the component including its LoRA adapters.
This prevents inactive components from updating via weight decay/momentum.
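Example (sketch: keep audio trainable while image/video paths stay frozen):
>>> model.freeze_components(['vision', 'video', 'image_generation', 'video_generation'])
>>> model.unfreeze_components(['audio'])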
"""
if 'llm'in components :
logger .warning ("Ignoring 'llm' in freeze list - LLM must always train (from scratch)")
components =[c for c in components if c !='llm']
logger .info (f"Freezing components: {components } (hard_freeze={hard_freeze })")
for group_name in components :
if group_name not in COMPONENT_GROUPS :
logger .warning (f" ⚠️ Unknown component group: {group_name }")
continue
for attr_name in COMPONENT_GROUPS [group_name ]:
if hasattr (self ,attr_name ):
component =getattr (self ,attr_name )
if component is not None :
if isinstance (component ,nn .Parameter ):
component .requires_grad =False
elif isinstance (component ,nn .Module ):
for name ,param in component .named_parameters ():
path_lora ='lora_A'in name or 'lora_B'in name or 'magnitude'in name
if hard_freeze or not path_lora :
param .requires_grad =False
logger .info (f"Frozen: {attr_name }")
if self .lora_applied and not hard_freeze:
enable_lora_training (self )
logger .info ("LoRA parameters remain trainable")
self ._print_stats ()
def unfreeze_components (self ,components :List [str ]):
"""
Unfreeze specific components of the model.
Args:
components: List of component group names to unfreeze.
"""
logger .info (f"Unfreezing components: {components }")
for group_name in components :
if group_name not in COMPONENT_GROUPS :
logger .warning (f" ⚠️ Unknown component group: {group_name }")
continue
for attr_name in COMPONENT_GROUPS [group_name ]:
if hasattr (self ,attr_name ):
component =getattr (self ,attr_name )
if component is not None :
if isinstance (component ,nn .Parameter ):
component .requires_grad =True
elif isinstance (component ,nn .Module ):
for param in component .parameters ():
param .requires_grad =True
logger .info (f"Unfrozen: {attr_name }")
self ._print_stats ()
def freeze_all_except (self ,components :List [str ],hard_freeze :bool =True ):
"""
Freeze all components except the specified ones.
NOTE: LLM is always kept trainable regardless of input - it's trained from scratch.
Args:
components: List of component group names to keep trainable.
"""
if 'llm'not in components :
components =components +['llm']
all_groups =list (COMPONENT_GROUPS .keys ())
groups_to_freeze =[g for g in all_groups if g not in components ]
self .freeze_components (groups_to_freeze ,hard_freeze =hard_freeze )
def get_trainable_component_names (self )->List [str ]:
"""Get list of component groups that have trainable parameters."""
trainable =[]
for group_name ,attr_names in COMPONENT_GROUPS .items ():
for attr_name in attr_names :
if hasattr (self ,attr_name ):
component =getattr (self ,attr_name )
if component is not None :
if isinstance (component ,nn .Parameter ):
if component .requires_grad :
trainable .append (group_name )
break
elif isinstance (component ,nn .Module ):
if any (p .requires_grad for p in component .parameters ()):
trainable .append (group_name )
break
return trainable
def get_frozen_component_names (self )->List [str ]:
"""Get list of component groups that are frozen (no trainable parameters)."""
        frozen = []
        for group_name, attr_names in COMPONENT_GROUPS.items():
            has_component = False
            is_trainable = False
            for attr_name in attr_names:
                if hasattr(self, attr_name):
                    component = getattr(self, attr_name)
                    if component is not None:
                        has_component = True
                        if isinstance(component, nn.Parameter):
                            if component.requires_grad:
                                is_trainable = True
                                break
                        elif isinstance(component, nn.Module):
                            if any(p.requires_grad for p in component.parameters()):
                                is_trainable = True
                                break
            if has_component and not is_trainable:
                frozen.append(group_name)
        return frozen
    def get_component_status(self) -> tuple:
"""
Get tuple of (trainable_components, frozen_components) for display.
Returns:
tuple: (list of trainable component names, list of frozen component names)
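        Example (illustrative; actual group names depend on the configuration):
            trainable, frozen = model.get_component_status()
            # e.g. trainable == ['vision', 'cross_attention'], frozen == ['audio', 'video_generation']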
"""
        trainable = self.get_trainable_component_names()
        frozen = self.get_frozen_component_names()
        return trainable, frozen
class XoronPreTrainedModel(PreTrainedModel):
"""Base class for Xoron models providing HuggingFace integration."""
config_class = XoronConfig
base_model_prefix = "xoron"
supports_gradient_checkpointing = True
_no_split_modules = ["XoronMultimodalModel"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
def _init_weights(self, module):
std = 0.02
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class XoronModel(XoronPreTrainedModel):
"""Xoron Multimodal Model for HuggingFace."""
def __init__(self, config: XoronConfig):
super().__init__(config)
self.config = config
self._internal_model = None
self._model_initialized = False
def _ensure_model_initialized(self):
"""Lazily initialize the internal model to avoid meta device conflicts."""
if not self._model_initialized:
self._internal_model = XoronMultimodalModel(self.config)
self._model_initialized = True
@property
def internal_model(self):
self._ensure_model_initialized()
return self._internal_model
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
"""
Load pretrained Xoron model from HuggingFace Hub or local path.
This override ensures proper initialization without meta device conflicts.
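        Example (illustrative; the repo id is a placeholder):
            model = XoronModel.from_pretrained("your-repo/xoron-model")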
"""
kwargs.pop('device_map', None)
config = kwargs.pop('config', None)
if config is None:
config = XoronConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
model = cls(config)
model._internal_model = XoronMultimodalModel(config)
model._model_initialized = True
import os
from safetensors import safe_open
if os.path.isdir(pretrained_model_name_or_path):
model_path = pretrained_model_name_or_path
else:
from huggingface_hub import snapshot_download
model_path = snapshot_download(repo_id=pretrained_model_name_or_path)
components_json = os.path.join(model_path, "components.json")
if os.path.exists(components_json):
with open(components_json, 'r') as f:
manifest = json.load(f)
component_map = {
'llm': model._internal_model.llm,
'vision_encoder': model._internal_model.vision_encoder,
'video_encoder': model._internal_model.video_encoder,
'audio_encoder': model._internal_model.audio_encoder,
'audio_decoder': model._internal_model.audio_decoder,
'projector': model._internal_model.projector,
'audio_projector': model._internal_model.audio_projector,
}
if model._internal_model.cross_attention_layers is not None:
component_map['cross_attention'] = model._internal_model.cross_attention_layers
if model._internal_model.generator is not None:
component_map['generator'] = model._internal_model.generator
if model._internal_model.video_generator is not None:
component_map['video_generator'] = model._internal_model.video_generator
if hasattr(model._internal_model, 'waveform_decoder') and model._internal_model.waveform_decoder is not None:
component_map['waveform_decoder'] = model._internal_model.waveform_decoder
for comp_name in manifest.get('components', []):
if comp_name == 'modality_markers':
continue
comp_path = os.path.join(model_path, f"{comp_name}.safetensors")
if os.path.exists(comp_path) and comp_name in component_map:
component = component_map[comp_name]
if component is not None:
with safe_open(comp_path, framework="pt") as f:
state_dict = {k: f.get_tensor(k) for k in f.keys()}
if comp_name == 'llm':
embed_key = 'model.embed_tokens.weight'
lm_head_key = 'lm_head.weight'
if embed_key in state_dict:
saved_vocab_size = state_dict[embed_key].shape[0]
hidden_size = state_dict[embed_key].shape[1]
current_vocab_size = component.model.embed_tokens.weight.shape[0]
if saved_vocab_size != current_vocab_size:
logger.info(f"Resizing embeddings: {current_vocab_size} -> {saved_vocab_size}")
new_embed = nn.Embedding(saved_vocab_size, hidden_size)
new_embed.weight.data = state_dict[embed_key]
component.model.embed_tokens = new_embed
if lm_head_key in state_dict:
new_lm_head = nn.Linear(hidden_size, saved_vocab_size, bias=False)
new_lm_head.weight.data = state_dict[lm_head_key]
component.lm_head = new_lm_head
del state_dict[embed_key]
if lm_head_key in state_dict:
del state_dict[lm_head_key]
component.load_state_dict(state_dict, strict=False)
logger.info(f"Loaded {comp_name}")
markers_path = os.path.join(model_path, "modality_markers.safetensors")
if os.path.exists(markers_path):
with safe_open(markers_path, framework="pt") as f:
model._internal_model.image_start.data = f.get_tensor('image_start')
model._internal_model.image_end.data = f.get_tensor('image_end')
model._internal_model.video_start.data = f.get_tensor('video_start')
model._internal_model.video_end.data = f.get_tensor('video_end')
model._internal_model.audio_start.data = f.get_tensor('audio_start')
model._internal_model.audio_end.data = f.get_tensor('audio_end')
logger.info("Loaded modality markers")
logger.info(f"Xoron model loaded from {pretrained_model_name_or_path}")
return model
def forward(
self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
pixel_values: Optional[torch.Tensor] = None,
video_frames: Optional[torch.Tensor] = None,
audio_features: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple, CausalLMOutputWithPast]:
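        """
        HuggingFace-style forward pass. Multimodal inputs (``pixel_values``,
        ``video_frames``, ``audio_features``) are passed to the internal model
        as ``images``/``video``/``audio``; remaining ``kwargs`` are ignored.
        """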
self._ensure_model_initialized()
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self._internal_model(
input_ids=input_ids,
attention_mask=attention_mask,
images=pixel_values,
video=video_frames,
audio=audio_features,
labels=labels,
)
if return_dict:
return CausalLMOutputWithPast(
loss=outputs.get("loss"),
logits=outputs.get("logits"),
past_key_values=outputs.get("past_key_values"),
hidden_states=outputs.get("hidden_states"),
attentions=outputs.get("attentions"),
)
return (outputs.get("loss"), outputs.get("logits"))
def generate_image(self, prompt_embeds: torch.Tensor, **kwargs):
self._ensure_model_initialized()
return self._internal_model.generate_image(prompt_embeds, **kwargs)
def generate_video(self, prompt_embeds: torch.Tensor, **kwargs):
self._ensure_model_initialized()
return self._internal_model.generate_video(prompt_embeds, **kwargs)
def generate_speech(self, text_embeds: torch.Tensor, **kwargs):
self._ensure_model_initialized()
return self._internal_model.generate_speech(text_embeds, **kwargs)
class XoronForCausalLM(XoronModel):
"""Alias for XoronModel for compatibility."""
pass
XoronConfig.register_for_auto_class()
XoronModel.register_for_auto_class("AutoModel")
XoronForCausalLM.register_for_auto_class("AutoModelForCausalLM")
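# Illustrative usage (assumes this file ships alongside configuration_xoron.py in the model repo):
#   from transformers import AutoModelForCausalLM
#   model = AutoModelForCausalLM.from_pretrained("your-repo/xoron-model", trust_remote_code=True)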