| | """# `shared_space_config.py` |
| | |
| | #### `*Config` |
| | """ |
| |
|
| | from typing import Optional |
| |
|
| | import torch |
| | from torch import nn |
| |
|
| | from transformers.configuration_utils import PretrainedConfig |
| | from transformers.modeling_utils import PreTrainedModel |
| |
|
| | """`def make_shorthand`""" |
| |
|
| | def make_shorthand(model_cfg): |
| | """ |
| | Takes an instance subencoder `*Config` and constructs a shorthand |
| | name for the model based on settings. |
| | """ |
| |
|
    # The leading dense layers use standard multi-head attention (hence "mha").
    dense_str = str(model_cfg.num_dense_layers) + "mha + "

    # The shared output subspace is optional; include its rank only when set.
    if model_cfg.o_shared_dim is not None:
        o_str = "." + str(model_cfg.o_shared_dim)
    else:
        o_str = ""

    attn_str = (
        dense_str
        + "mla."
        + str(model_cfg.q_shared_dim)
        + "."
        + str(model_cfg.kv_shared_dim)
        + o_str
    )

    if model_cfg.ffn_decompose:
        # With decomposed FFNs, the leading dense layers keep full-rank MLPs.
        dense_str = (
            str(model_cfg.num_dense_layers)
            + "mlp."
            + str(model_cfg.intermediate_size)
            + " + "
        )

        mlp_str = (
            dense_str
            + str(model_cfg.num_hidden_layers - model_cfg.num_dense_layers)
            + "dcmp."
            + "x"
            + str(model_cfg.intermediate_size)
            + "."
            + str(model_cfg.ffn_rank)
        )
    else:
        mlp_str = "mlp." + str(model_cfg.intermediate_size)

    shorthand = (
        f"{attn_str} - {mlp_str} - "
        f"h{model_cfg.hidden_size} - l{model_cfg.num_hidden_layers}"
    )

    """
    The run name includes training settings:

    run_name = (
        f"{config['stats']['total_elements']} - "
        f"{attn_str} - {mlp_str} - "
        f"h{model_cfg.hidden_size} - l{model_cfg.num_hidden_layers} - "
        f"bs{ptrain_cfg['train_batch_size']} - lr{lr_str} - "
        f"seq{ptrain_cfg['max_seq_length']}"
    )
    """

    return shorthand

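"""
A quick illustration (with assumed, arbitrary settings, not from any
particular run): two dense layers followed by MLA layers with shared
query / key-value dims of 256 / 128, no shared output subspace, and a
standard 3072-wide FFN would be summarized as:

    cfg = SharedSpaceDecoderConfig(
        hidden_size=512,
        num_hidden_layers=12,
        num_dense_layers=2,
        q_shared_dim=256,
        kv_shared_dim=128,
        o_shared_dim=None,
        qk_private_dim=32,
        vo_private_dim=32,
        nope_dims=16,
        intermediate_size=3072,
        ffn_decompose=False,
    )
    make_shorthand(cfg)  # -> "2mha + mla.256.128 - mlp.3072 - h512 - l12"
"""
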
class SharedSpaceDecoderConfig(PretrainedConfig):
    r"""
    Configuration class for the Shared Space Decoder model.

    Extends the HuggingFace `PretrainedConfig` to support architectural
    variations including:
    - Multi-Head Latent Attention (MLA)
    - Decomposed MLPs (low-rank FFNs)
    - Flexible attention backends (eager, flash, sdpa)
    - Explicit shared subspaces for Q, K, V, and O projections

    This config does not infer any defaults based on `hidden_size`. All
    dimensions and ranks must be explicitly specified. If required values are
    missing, a `ValueError` is raised during initialization.

    ----------------------
    Core Model Parameters:
    ----------------------
    - vocab_size (`int`): Vocabulary size.
    - hidden_size (`int`): Model hidden dimension.
    - num_hidden_layers (`int`): Number of transformer blocks.
    - intermediate_size (`int`): Feed-forward hidden dimension.
    - hidden_act (`str`): Activation function.
    - hidden_dropout_prob (`float`): Dropout after projections and FFNs.
    - attention_dropout_prob (`float`): Dropout applied to attention scores.
    - max_position_embeddings (`int`): Maximum sequence length.
    - initializer_range (`float`): Stddev of weight initialization.

    - layer_norm_eps (`float`): Epsilon for LayerNorm.
    - rms_norm_eps (`float`): Epsilon for RMSNorm.
    - norm_type (`str`): Either `"layernorm"` or `"rmsnorm"`.

    - classifier_dropout (`float` or `None`): Dropout for the final classifier.

    - vocab_subspace (`bool`): Whether to factor the vocabulary embedding
      through a low-rank subspace.
    - vocab_rank (`int` or `None`): Rank of the vocabulary subspace
      (required if `vocab_subspace=True`).
    - tie_word_embeddings (`bool`): Whether to tie the input and output
      embeddings.

    ----------------------------------
    Multi-Head Latent Attention (MLA):
    ----------------------------------
    - num_attention_heads (`int`): Number of attention heads.

    - q_shared_dim (`int`): Rank of the shared query subspace.
    - kv_shared_dim (`int`): Rank of the shared key/value subspace.
    - o_shared_dim (`int` or `None`): Rank of the shared output subspace.
      Set to `None` to disable the shared output subspace.

    - qk_private_dim (`int`): Query/key private dimension per head.
    - vo_private_dim (`int`): Value/output private dimension per head.

    - rope_dims (`int`): Number of head dimensions carrying RoPE.
    - nope_dims (`int`): Number of head dimensions without positional encoding.
    - rope_theta (`float`): Base frequency used for RoPE.
    - rope_scaling (`dict` or `None`): HF-style scaling dict for RoPE.
    - attention_bias (`bool`): Whether to include bias terms in the Q/K/V projections.
    - num_dense_layers (`int`): Number of leading layers that do not use
      subspaces for attention or FFNs.
    - attention_backend (`str`): One of `"eager"`, `"flash_attention_2"`, or `"sdpa"`.

    ------------------------------
    Decomposed MLP (Low-Rank FFN):
    ------------------------------
    - ffn_decompose (`bool`): Whether to enable low-rank FFNs.
    - ffn_rank (`int`): Rank of the shared FFN latent space (required if `ffn_decompose=True`).

    ----------------------
    Validation Behavior:
    ----------------------
    Raises `ValueError` at init time if:
    - `num_dense_layers` exceeds `num_hidden_layers`.
    - `vocab_subspace=True` but `vocab_rank` is unset.
    - Subspace layers exist but neither `q_shared_dim` nor `kv_shared_dim` is set.
    - `qk_private_dim`, `vo_private_dim`, or `nope_dims` is unset.
    - `ffn_decompose=True` but `ffn_rank` is unset, or there are no
      subspace layers to decompose.
    - An unknown `attention_backend` or `norm_type` is provided.
    """

    model_type = "shared_subspace_decoder"

    def __init__(
        self,

        vocab_size: int = 30522,
        hidden_size: int = 512,
        num_hidden_layers: int = 12,
        intermediate_size: int = 3072,

        hidden_dropout_prob=0.1,
        attention_dropout_prob=0.1,
        max_position_embeddings: int = 2048,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        rms_norm_eps=1e-6,
        norm_type="layernorm",
        classifier_dropout=None,

        vocab_subspace=False,
        vocab_rank=None,
        tie_word_embeddings=True,

        num_attention_heads: int = 16,
        rope_dims: int = 16,

        q_shared_dim: Optional[int] = None,
        kv_shared_dim: Optional[int] = None,
        o_shared_dim: Optional[int] = None,

        qk_private_dim: Optional[int] = None,
        vo_private_dim: Optional[int] = None,
        nope_dims: Optional[int] = None,

        attention_backend="eager",
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,

        num_dense_layers=12,

        ffn_decompose=False,
        ffn_rank=None,
        **kwargs
    ) -> None:
        super().__init__(**kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_dropout_prob = attention_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.rms_norm_eps = rms_norm_eps
        self.norm_type = norm_type
        self.classifier_dropout = classifier_dropout

        self.vocab_subspace = vocab_subspace
        self.vocab_rank = vocab_rank
        self.tie_word_embeddings = tie_word_embeddings

        self.num_attention_heads = num_attention_heads
        self.rope_dims = rope_dims

        self.q_shared_dim = q_shared_dim
        self.kv_shared_dim = kv_shared_dim
        self.o_shared_dim = o_shared_dim

        self.qk_private_dim = qk_private_dim
        self.vo_private_dim = vo_private_dim
        self.nope_dims = nope_dims
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.num_dense_layers = num_dense_layers

        self.ffn_decompose = ffn_decompose
        self.ffn_rank = ffn_rank

        self.attention_backend = attention_backend

        # The class docstring promises a `ValueError` at construction time
        # for incomplete or inconsistent settings, so run the checks here.
        self._validate()

    def _validate(self):
        if self.num_dense_layers > self.num_hidden_layers:
            raise ValueError("`num_dense_layers` must be <= `num_hidden_layers`")
        if self.vocab_subspace and self.vocab_rank is None:
            raise ValueError("`vocab_rank` must be set when `vocab_subspace=True`")

        # Layers beyond the leading dense ones use shared attention subspaces.
        if (
            self.num_dense_layers < self.num_hidden_layers
            and self.q_shared_dim is None
            and self.kv_shared_dim is None
        ):
            raise ValueError(
                "At least one of `q_shared_dim` or `kv_shared_dim` must be set "
                "when there are subspace layers"
            )

        if self.qk_private_dim is None or self.vo_private_dim is None:
            raise ValueError("Must set `qk_private_dim` and `vo_private_dim`")
        if self.nope_dims is None:
            raise ValueError("Must set `nope_dims`")

        if self.ffn_decompose and self.ffn_rank is None:
            raise ValueError("`ffn_rank` must be set when `ffn_decompose=True`")
        if self.ffn_decompose and self.num_dense_layers >= self.num_hidden_layers:
            raise ValueError(
                "`ffn_decompose` was set but `num_dense_layers` is >= the number of layers"
            )

        valid_backends = ["eager", "flash_attention_2", "sdpa"]
        if self.attention_backend not in valid_backends:
            raise ValueError(
                f"Unknown attention backend: {self.attention_backend}, options are {valid_backends}"
            )

        valid_norm_types = ["layernorm", "rmsnorm"]
        if self.norm_type not in valid_norm_types:
            raise ValueError(
                f"Unknown norm type: {self.norm_type}, options are {valid_norm_types}"
            )

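"""
A minimal sanity check (assumed, arbitrary values): enabling FFN
decomposition without a rank is one of the misconfigurations that
`_validate` rejects.

    SharedSpaceDecoderConfig(
        num_dense_layers=2,
        q_shared_dim=256,
        kv_shared_dim=128,
        qk_private_dim=32,
        vo_private_dim=32,
        nope_dims=16,
        ffn_decompose=True,
        ffn_rank=None,
    )
    # ValueError: `ffn_rank` must be set when `ffn_decompose=True`
"""
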

import json


def get_config(filename):
    """
    Load the full experiment config from a JSON file, strictly check that
    its "model" section matches `SharedSpaceDecoderConfig.__init__`, and
    return both the raw dict and the constructed config object.
    """
    with open(filename) as f:
        full_cfg = json.load(f)

    # The accepted keys are exactly the named parameters of `__init__`.
    # (`co_varnames` lists the parameters first; this `__init__` defines no
    # additional locals, so only `self` and `kwargs` need to be dropped.)
    valid_keys = SharedSpaceDecoderConfig.__init__.__code__.co_varnames
    valid_keys = set(valid_keys) - {"self", "kwargs"}

    extra_keys = set(full_cfg["model"]) - valid_keys
    missing_keys = valid_keys - set(full_cfg["model"])

    if extra_keys:
        raise ValueError(f"Unknown keys in config: {sorted(extra_keys)}")

    if missing_keys:
        raise ValueError(f"config json is missing: {sorted(missing_keys)}")

    model_cfg = SharedSpaceDecoderConfig(**full_cfg["model"])

    return full_cfg, model_cfg
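
"""
For reference, a sketch of the JSON layout that `get_config` expects (the
key names are the `__init__` parameters; the values below are illustrative,
not recommendations). Other top-level sections, such as the training
settings referenced in the run-name note above, are passed through in
`full_cfg`; only the "model" section is checked and consumed here.

    {
        "model": {
            "vocab_size": 30522,
            "hidden_size": 512,
            "num_hidden_layers": 12,
            "intermediate_size": 3072,
            "hidden_dropout_prob": 0.1,
            "attention_dropout_prob": 0.1,
            "max_position_embeddings": 2048,
            "initializer_range": 0.02,
            "layer_norm_eps": 1e-12,
            "rms_norm_eps": 1e-6,
            "norm_type": "rmsnorm",
            "classifier_dropout": null,
            "vocab_subspace": false,
            "vocab_rank": null,
            "tie_word_embeddings": true,
            "num_attention_heads": 16,
            "rope_dims": 16,
            "q_shared_dim": 256,
            "kv_shared_dim": 128,
            "o_shared_dim": null,
            "qk_private_dim": 32,
            "vo_private_dim": 32,
            "nope_dims": 16,
            "attention_backend": "sdpa",
            "rope_theta": 10000.0,
            "rope_scaling": null,
            "attention_bias": false,
            "num_dense_layers": 2,
            "ffn_decompose": false,
            "ffn_rank": null
        }
    }

Usage:

    full_cfg, model_cfg = get_config("config.json")
    print(make_shorthand(model_cfg))
"""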