Instructions to use merterbak/Seed-0.5B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use merterbak/Seed-0.5B with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="merterbak/Seed-0.5B", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("merterbak/Seed-0.5B", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use merterbak/Seed-0.5B with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "merterbak/Seed-0.5B"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "merterbak/Seed-0.5B",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```
Use Docker
docker model run hf.co/merterbak/Seed-0.5B
- SGLang
How to use merterbak/Seed-0.5B with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "merterbak/Seed-0.5B" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "merterbak/Seed-0.5B",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "merterbak/Seed-0.5B" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "merterbak/Seed-0.5B",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```
- Docker Model Runner
How to use merterbak/Seed-0.5B with Docker Model Runner:
docker model run hf.co/merterbak/Seed-0.5B
| import math | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from transformers import PreTrainedModel | |
| from transformers.cache_utils import DynamicCache | |
| from transformers.generation import GenerationMixin | |
| from transformers.modeling_outputs import CausalLMOutputWithPast | |
| from .configuration_seed import SeedConfig | |
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learned gain.

    Args:
        dim: Length of the learned gain vector (initialised to ones).
        eps: Constant added to the mean square before taking the reciprocal
            square root.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.epsilon = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Return ``x`` divided by the RMS of its last axis, scaled by ``weight``."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.epsilon)
        # Multiply left-to-right exactly as before so dtype promotion is unchanged.
        return x * inv_rms * self.weight
class RoPEEmbedding(nn.Module):
    """Produces the cos/sin tables used for rotary position embedding.

    Reads ``n_embd``, ``n_head``, ``head_dim`` and ``rope_theta`` from
    ``config``, plus the optional ``rope_scaling_type`` (default ``"none"``)
    and ``rope_scaling_factor`` (default ``1.0``).

    Scaling is skipped when the type is ``"none"`` or the factor equals 1.0.
    Otherwise both ``"yarn"`` and ``"ntk"`` multiply the base by
    ``factor ** (head_dim / (head_dim - 2))``; ``"yarn"`` additionally sets
    ``attention_scaling = 0.1 * ln(factor) + 1``.

    Raises:
        ValueError: If scaling is active and the type is neither ``"yarn"``
            nor ``"ntk"``.
    """

    def __init__(self, config, device=None):
        super().__init__()
        self.config = config
        assert config.n_embd % config.n_head == 0
        self.head_dim = config.head_dim
        self.rope_scaling_type = str(getattr(config, "rope_scaling_type", "none"))
        self.rope_scaling_factor = float(getattr(config, "rope_scaling_factor", 1.0))
        self.position_scale = 1.0
        self.attention_scaling = 1.0

        theta = float(config.rope_theta)
        scaling_active = self.rope_scaling_type != "none" and self.rope_scaling_factor != 1.0
        if scaling_active:
            if self.rope_scaling_type not in ("yarn", "ntk"):
                raise ValueError(f"Unknown rope_scaling_type={self.rope_scaling_type!r}")
            # Both supported schemes rescale the base the same way.
            exponent = self.head_dim / (self.head_dim - 2.0)
            theta = theta * (self.rope_scaling_factor ** exponent)
            if self.rope_scaling_type == "yarn":
                self.attention_scaling = 0.1 * math.log(self.rope_scaling_factor) + 1.0
        self.base = theta

        # One inverse frequency per pair of channels (even indices 0, 2, ...).
        channel_fraction = torch.arange(
            0, self.head_dim, 2, dtype=torch.float32, device=device
        ) / float(self.head_dim)
        inv_freq = 1.0 / (self.base ** channel_fraction)
        # Non-persistent: rebuilt from config rather than stored in checkpoints.
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def forward(self, x, position_ids):
        """Return ``(cos, sin)`` for ``position_ids`` cast to the dtype of ``x``.

        ``x`` is only consulted for its dtype. Each output has the shape of
        ``position_ids`` with one trailing axis holding the per-pair angles
        concatenated with themselves.
        """
        out_dtype = x.dtype
        scaled_positions = position_ids.float()[..., None] * self.position_scale
        angles = scaled_positions * self.inv_freq[None, None, :]
        table = torch.cat((angles, angles), dim=-1)
        cos = (table.cos() * self.attention_scaling).to(out_dtype)
        sin = (table.sin() * self.attention_scaling).to(out_dtype)
        return cos, sin
def rotate_half(x):
    """Split the last axis in two halves ``(a, b)`` and return ``(-b, a)``."""
    first_half, second_half = torch.chunk(x, 2, dim=-1)
    return torch.cat((second_half.neg(), first_half), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    """Rotate query and key tensors with the given cos/sin tables.

    Args:
        q: Query tensor.
        k: Key tensor.
        cos: Cosine table; gains a singleton axis at ``unsqueeze_dim`` so it
            can broadcast against ``q`` and ``k``.
        sin: Sine table, treated like ``cos``.
        unsqueeze_dim: Axis at which the singleton dimension is inserted.

    Returns:
        Tuple ``(q_rotated, k_rotated)``.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)

    def _rotate(t):
        return (t * cos_b) + (rotate_half(t) * sin_b)

    return _rotate(q), _rotate(k)
class GQA(nn.Module):
    """Grouped-query self-attention with RMS-normalised q/k and rotary embedding.

    ``n_head`` query heads share ``n_kv_head`` key/value heads (defaulting to
    ``n_head`` when the config has no ``n_kv_head``). Queries and keys are
    RMS-normalised per head before the rotary embedding is applied.

    Note: ``forward`` takes no padding mask; only causal masking is applied.
    """

    def __init__(self, config, layer_idx):
        super().__init__()
        self.layer_idx = int(layer_idx)
        self.n_head = config.n_head
        self.n_kv_head = int(getattr(config, "n_kv_head", config.n_head))
        self.n_embd = config.n_embd
        self.block_size = int(config.block_size)
        assert 1 <= self.n_kv_head <= self.n_head
        assert self.n_head % self.n_kv_head == 0
        self.head_dim = config.head_dim

        query_width = self.n_head * self.head_dim
        kv_width = self.n_kv_head * self.head_dim
        self.q_proj = nn.Linear(self.n_embd, query_width, bias=False)
        self.k_proj = nn.Linear(self.n_embd, kv_width, bias=False)
        self.v_proj = nn.Linear(self.n_embd, kv_width, bias=False)
        self.o_proj = nn.Linear(query_width, self.n_embd, bias=False)
        self.q_norm = RMSNorm(self.head_dim)
        self.k_norm = RMSNorm(self.head_dim)

    def forward(self, x, cos, sin, past_key_values=None):
        """Attend over ``x`` (batch, seq, n_embd) and return a same-shaped tensor.

        Args:
            x: Input hidden states, unpacked as ``(batch, seq, channels)``.
            cos: Rotary cosine table passed to ``apply_rotary_pos_emb``.
            sin: Rotary sine table passed to ``apply_rotary_pos_emb``.
            past_key_values: Optional cache exposing
                ``get_seq_length(layer_idx)`` and ``update(k, v, layer_idx)``;
                when given, the new keys/values are appended to it.
        """
        bsz, q_len, _ = x.shape

        def _split_heads(projected, heads):
            # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
            return projected.view(bsz, q_len, heads, self.head_dim).transpose(1, 2)

        q = _split_heads(self.q_proj(x), self.n_head)
        k = _split_heads(self.k_proj(x), self.n_kv_head)
        v = _split_heads(self.v_proj(x), self.n_kv_head)
        q = self.q_norm(q)
        k = self.k_norm(k)
        q, k = apply_rotary_pos_emb(q, k, cos, sin)

        cached_len = 0
        if past_key_values is not None:
            # Read the length before update() so it counts only earlier tokens.
            cached_len = past_key_values.get_seq_length(self.layer_idx)
            k, v = past_key_values.update(k, v, self.layer_idx)

        if self.n_head != self.n_kv_head:
            # Duplicate each kv head so every query head has a partner.
            group_size = self.n_head // self.n_kv_head
            k = k.repeat_interleave(group_size, dim=1)
            v = v.repeat_interleave(group_size, dim=1)

        if cached_len == 0:
            attn_out = F.scaled_dot_product_attention(q, k, v, attn_mask=None, is_causal=True)
        else:
            # Queries sit after the cached prefix, so build an explicit mask:
            # key j is visible to query i when j <= cached_len + i.
            kv_len = int(k.size(2))
            q_positions = cached_len + torch.arange(q_len, device=x.device)
            k_positions = torch.arange(kv_len, device=x.device)
            visible = k_positions.unsqueeze(0) <= q_positions.unsqueeze(1)
            blocked = ~visible.view(1, 1, q_len, kv_len)
            additive_mask = torch.zeros((1, 1, q_len, kv_len), device=x.device, dtype=q.dtype)
            additive_mask = additive_mask.masked_fill(blocked, torch.finfo(q.dtype).min)
            attn_out = F.scaled_dot_product_attention(
                q, k, v, attn_mask=additive_mask, is_causal=False
            )

        merged = attn_out.transpose(1, 2).contiguous().view(bsz, q_len, -1)
        return self.o_proj(merged)
class SwiGLU(nn.Module):
    """Gated feed-forward block: ``down(silu(gate(x)) * up(x))``.

    The hidden width is ``config.mlp_hidden_dim`` when that attribute exists
    and is not ``None``; otherwise it is ``int(4 * n_embd * 2 / 3)`` rounded
    up to the next multiple of 256. ``config.bias`` toggles the bias on all
    three linear layers.
    """

    def __init__(self, config):
        super().__init__()
        self.n_embd = config.n_embd

        width = getattr(config, "mlp_hidden_dim", None)
        if width is None:
            width = int(4 * self.n_embd * 2 / 3)
            # Round up to a multiple of 256.
            width = 256 * ((width + 255) // 256)

        use_bias = config.bias
        self.gate_proj = nn.Linear(self.n_embd, width, bias=use_bias)
        self.up_proj = nn.Linear(self.n_embd, width, bias=use_bias)
        self.down_proj = nn.Linear(width, self.n_embd, bias=use_bias)

    def forward(self, x):
        """Apply the gated MLP to ``x`` and return a tensor with ``n_embd`` channels."""
        gated = F.silu(self.gate_proj(x))
        lifted = self.up_proj(x)
        return self.down_proj(gated * lifted)
class DecoderLayer(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual add.

    Both norms are ``RMSNorm`` over ``config.n_embd`` using
    ``config.rms_norm_eps``.
    """

    def __init__(self, config, layer_idx):
        super().__init__()
        width, eps = config.n_embd, config.rms_norm_eps
        self.input_norm = RMSNorm(width, eps=eps)
        self.post_attn_norm = RMSNorm(width, eps=eps)
        self.attn = GQA(config, layer_idx=layer_idx)
        self.mlp = SwiGLU(config)

    def forward(self, x, cos, sin, past_key_values=None):
        """Run one decoder block on ``x`` and return the updated hidden states.

        ``cos``, ``sin`` and ``past_key_values`` are forwarded unchanged to the
        attention sub-layer.
        """
        attn_out = self.attn(self.input_norm(x), cos, sin, past_key_values=past_key_values)
        after_attn = x + attn_out
        mlp_out = self.mlp(self.post_attn_norm(after_attn))
        return after_attn + mlp_out
class SeedPreTrainedModel(PreTrainedModel):
    """Common base for Seed models; binds ``SeedConfig`` to ``PreTrainedModel``.

    Only class-level settings are declared here. They are read by the
    transformers loading/dispatch machinery, not by code in this file.
    """

    # Configuration class associated with this model family.
    config_class = SeedConfig
    # Prefix transformers uses to locate the base model.
    # NOTE(review): no attribute named "model" is defined in this file - confirm intent.
    base_model_prefix = "model"

    # Attention is computed with F.scaled_dot_product_attention.
    _supports_sdpa = True
    # Presumably keeps each DecoderLayer on a single device when sharding - verify.
    _no_split_modules = ["DecoderLayer"]
    # Presumably excludes the cache object from automatic device placement - verify.
    _skip_keys_device_placement = ["past_key_values"]
class SeedForCausalLM(SeedPreTrainedModel, GenerationMixin):
    """Decoder-only causal LM: token embedding, decoder stack, final norm, LM head.

    Built from ``config.vocab_size``, ``config.n_embd``, ``config.n_layer`` and
    ``config.rms_norm_eps``. Rotary cos/sin tables are computed once per
    forward pass and shared by every layer.
    """

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.layers = nn.ModuleList([DecoderLayer(config, layer_idx=i) for i in range(config.n_layer)])
        self.norm = RMSNorm(config.n_embd, eps=config.rms_norm_eps)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.rope = RoPEEmbedding(config)
        self.post_init()

    # Embedding accessors required by the transformers PreTrainedModel interface.
    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.wte

    def set_input_embeddings(self, value):
        """Replace the token embedding module."""
        self.wte = value

    def get_output_embeddings(self):
        """Return the LM head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the LM head."""
        self.lm_head = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        past_key_values=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        token_type_ids=None,
        **kwargs
    ):
        """Run the model and return logits, optional loss and optional cache.

        Args:
            input_ids: Token ids; embedded with ``wte`` when ``inputs_embeds``
                is not given.
            attention_mask: Accepted for API compatibility; not used here.
            position_ids: Positions fed to the rotary embedding. When omitted
                they start at the current cache length.
            past_key_values: Cache object passed to every layer. A fresh
                ``DynamicCache`` is created when ``use_cache`` is truthy and
                none is supplied.
            inputs_embeds: Pre-computed embeddings; take precedence over
                ``input_ids``.
            labels: Target ids. The loss compares ``logits[:, :-1]`` with
                ``labels[:, 1:]``, so labels should be passed unshifted.
            use_cache: When truthy, the cache is returned in the output.
            token_type_ids: Accepted for API compatibility; not used here.
            **kwargs: Ignored.

        Returns:
            ``CausalLMOutputWithPast`` with ``loss``, ``logits`` and
            ``past_key_values`` (``None`` unless ``use_cache`` is truthy).

        Raises:
            ValueError: If neither ``input_ids`` nor ``inputs_embeds`` is given.
        """
        if inputs_embeds is None:
            if input_ids is None:
                # Fail early with a clear message instead of an opaque
                # error from inside the embedding lookup.
                raise ValueError("You must specify either input_ids or inputs_embeds")
            inputs_embeds = self.wte(input_ids)
        B, T = inputs_embeds.shape[:2]

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if position_ids is None:
            # Continue numbering after whatever the cache already holds.
            past_seen = past_key_values.get_seq_length() if past_key_values is not None else 0
            position_ids = torch.arange(
                past_seen, past_seen + T, device=inputs_embeds.device
            ).unsqueeze(0).expand(B, T)

        cos, sin = self.rope(inputs_embeds, position_ids)

        x = inputs_embeds
        for layer in self.layers:
            x = layer(x, cos, sin, past_key_values=past_key_values)
        x = self.norm(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Next-token objective: position t predicts token t + 1.
            loss = F.cross_entropy(
                logits[:, :-1].contiguous().view(-1, logits.size(-1)),
                labels[:, 1:].contiguous().view(-1)
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=past_key_values if use_cache else None
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        """Build the keyword arguments for one generation step.

        When the cache already holds tokens, only the tokens that are not yet
        cached are fed to the model:

        * if ``input_ids`` is longer than the cache, everything after the
          cached prefix is kept (one token in ordinary decoding, several when
          continuing from a pre-filled cache);
        * otherwise (e.g. the prompt was supplied as ``inputs_embeds`` so
          ``input_ids`` only holds generated tokens) the last token is kept.

        ``inputs_embeds`` is used only for the first step, while the cache is
        still empty.

        Returns:
            Dict with ``input_ids`` or ``inputs_embeds``, plus
            ``past_key_values``, ``use_cache=True`` and ``attention_mask``.
        """
        past_length = 0
        if past_key_values is not None:
            past_length = past_key_values.get_seq_length()

        if past_length > 0:
            if input_ids.shape[1] > past_length:
                # Keep every uncached token; always taking the last one would
                # silently drop tokens when more than one is pending.
                input_ids = input_ids[:, past_length:]
            else:
                input_ids = input_ids[:, -1:]

        if inputs_embeds is not None and past_length == 0:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update({
            "past_key_values": past_key_values,
            "use_cache": True,
            "attention_mask": attention_mask,
        })
        return model_inputs