|
|
| from transformers import PretrainedConfig, PreTrainedModel, AutoConfig, AutoModelForCausalLM |
| from transformers.modeling_outputs import CausalLMOutputWithPast |
| from typing import List, Optional, Tuple |
| from torch import nn |
| import torch |
| import torch.nn.functional as F |
| import math |
|
|
# Hugging Face Hub repository id the pretrained SabiYarn-125M weights/config
# are published under (used by callers to load/push this model).
repo_name = "BeardedMonster/SabiYarn-125M"
|
|
|
|
class GPTJXMoEConfig(PretrainedConfig):
    """Configuration for the SabiYarn (GPT-JX, optionally MoE) model.

    All arguments are stored as attributes and serialized/deserialized by
    the ``PretrainedConfig`` machinery (``save_pretrained`` /
    ``from_pretrained``).

    Args:
        block_size: Maximum sequence length (context window) in tokens.
        vocab_size: Size of the token vocabulary.
        n_layer: Number of transformer blocks.
        n_heads: Number of attention heads per block.
        n_embd: Embedding / hidden state dimension.
        dropout: Dropout probability (``0.0`` disables dropout).
        max_batch_size: Maximum batch size the KV cache is sized for.
        use_kv_cache: Whether to allocate and use a key/value cache at
            inference time.
        bias: Whether linear layers use bias terms.
        kv_cache_dtype: Name of the dtype used for KV-cache tensors
            (e.g. ``"float32"``).
        use_moe: Whether MLP layers are replaced with Mixture-of-Experts
            layers.
        num_experts: Total number of experts per MoE layer.
        num_experts_per_tok: Number of experts each token is routed to
            (top-k routing).
        moe_dim: Hidden dimension of each expert MLP. Defaults to
            ``4 * n_embd`` when ``None``.
        **kwargs: Forwarded to ``PretrainedConfig`` (e.g. token ids).
    """

    model_type = "sabiyarn"

    def __init__(
        self,
        block_size: int = 32768,
        vocab_size: int = 52050,
        n_layer: int = 12,
        n_heads: int = 12,
        n_embd: int = 768,
        dropout: float = 0.0,
        max_batch_size: int = 1,
        use_kv_cache: bool = True,
        bias: bool = False,
        kv_cache_dtype: str = "float32",
        use_moe: bool = False,
        num_experts: int = 4,
        num_experts_per_tok: int = 2,
        moe_dim: Optional[int] = None,  # was annotated `int` with a None default
        **kwargs,
    ):
        # Core transformer hyperparameters.
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_heads = n_heads
        self.n_embd = n_embd
        self.dropout = dropout
        self.bias = bias

        # Inference-time KV-cache settings.
        self.use_kv_cache = use_kv_cache
        self.max_batch_size = max_batch_size
        self.kv_cache_dtype = kv_cache_dtype

        # Mixture-of-Experts settings.
        self.use_moe = use_moe
        self.num_experts = num_experts
        self.num_experts_per_tok = num_experts_per_tok
        # Conventional transformer MLP expansion factor of 4 when unspecified.
        self.moe_dim = moe_dim if moe_dim is not None else (4 * n_embd)

        super().__init__(**kwargs)
|
|
|
|