MartialTerran committed
Commit 95b814a • 1 Parent(s): 0d7b558
Update SmolLM2_360M_model_debugging.py
Files changed: SmolLM2_360M_model_debugging.py (+26 -25)
SmolLM2_360M_model_debugging.py
CHANGED
@@ -48,12 +48,12 @@ import torch.nn.functional as F
 # --- Utility Functions ---
 
 def load_json(file_path: str) -> Dict:
-
+    """Load JSON data from a file."""
     with open(file_path, 'r', encoding='utf-8') as f:
         return json.load(f)
 
 def timed_step(start: float, step_name: str) -> float:
-
+    """Print time taken for a step and return new start time."""
     end = time.time()
     print(f"Time taken for {step_name}: {end - start:.4f} seconds")
     return end
@@ -61,23 +61,23 @@ def timed_step(start: float, step_name: str) -> float:
 # --- Model Architecture ---
 
 class RMSNorm(nn.Module):
-
+    """Root Mean Square Normalization."""
     def __init__(self, dim: int, eps: float = 1e-5):
         super().__init__()
         self.eps = eps
         self.weight = nn.Parameter(torch.ones(dim))
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-
+        """Apply RMS normalization."""
         norm_x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
         return self.weight * norm_x
 
 def silu(x: torch.Tensor) -> torch.Tensor:
-
+    """SiLU activation function."""
     return x * torch.sigmoid(x)
 
 class RotaryEmbedding(nn.Module):
-
+    """Rotary Positional Embedding."""
     def __init__(self, dim: int, base: int = 10000):
         super().__init__()
         self.dim = dim
@@ -85,23 +85,23 @@ class RotaryEmbedding(nn.Module):
         self.inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))
 
     def forward(self, seq_len: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
-
+        """Generate rotary embeddings for a given sequence length."""
         t = torch.arange(seq_len, device=device).type_as(self.inv_freq)
         freqs = torch.outer(t, self.inv_freq)
         return torch.cat((freqs, freqs), dim=-1)
 
 def apply_rotary_emb(pos: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
-
+    """Apply rotary embeddings to the given tensor."""
     return (t * torch.cos(pos)) + (rotate_half(t) * torch.sin(pos))
 
 def rotate_half(x: torch.Tensor) -> torch.Tensor:
-
+    """Rotate half of the tensor."""
     x1 = x[..., : x.shape[-1] // 2]
     x2 = x[..., x.shape[-1] // 2 :]
     return torch.cat((-x2, x1), dim=-1)
 
 class LlamaAttention(nn.Module):
-
+    """Multi-headed attention layer for LLaMA."""
     def __init__(self, config: Dict):
         super().__init__()
         self.config = config
@@ -121,7 +121,7 @@ class LlamaAttention(nn.Module):
         self.attn_dropout = nn.Dropout(config['attention_dropout'])
 
     def forward(self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, use_cache: bool = True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
-
+        """Compute multi-headed attention."""
 
         batch_size, seq_length, _ = hidden_states.size()
         query_states = self.q_proj(hidden_states).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
@@ -170,7 +170,7 @@ class LlamaAttention(nn.Module):
         return attn_output, present_key_value
 
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-
+    """Repeat hidden states n_rep times for key/value heads."""
     #Stitch1
     batch, num_key_value_heads, seq_len, head_dim = hidden_states.shape
     if n_rep == 1:
@@ -179,7 +179,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, seq_len, head_dim)
 
 class LlamaMLP(nn.Module):
-
+    """Multi-Layer Perceptron for LLaMA."""
     def __init__(self, config: Dict):
         super().__init__()
         hidden_size = config['hidden_size']
@@ -190,11 +190,11 @@ class LlamaMLP(nn.Module):
         self.act_fn = silu if config['hidden_act'] == 'silu' else getattr(F, config['hidden_act'])
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-
+        """Apply MLP to the input tensor."""
         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
 
 class LlamaBlock(nn.Module):
-
+    """LLaMA block containing attention and MLP layers."""
     def __init__(self, config: Dict):
         super().__init__()
         self.hidden_size = config['hidden_size']
@@ -204,7 +204,7 @@ class LlamaBlock(nn.Module):
         self.post_attention_layernorm = RMSNorm(self.hidden_size, eps=config['rms_norm_eps'])
 
     def forward(self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, use_cache: bool = True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
-
+        """Apply the LLaMA block."""
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
         hidden_states, present_key_value = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, use_cache=use_cache)
@@ -216,7 +216,7 @@ class LlamaBlock(nn.Module):
         return hidden_states, present_key_value
 
 class SmolLM2_360M(nn.Module):
-
+    """SmolLM2-360M model implementation."""
     def __init__(self, config_path: str):
         super().__init__()
         self.config = load_json(config_path)
@@ -247,7 +247,7 @@ class SmolLM2_360M(nn.Module):
         self.past_keys_values = None
 
     def load_weights(self, weights_path: str):
-
+        """Load weights from a safetensors file."""
         start = time.time()
         try:
             from safetensors import safe_open
@@ -274,7 +274,7 @@ class SmolLM2_360M(nn.Module):
         end = timed_step(start, "Weight Loading")
 
     def forward(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None, use_cache: Optional[bool] = None) -> Tuple[torch.Tensor, Optional[List[Tuple[torch.Tensor, torch.Tensor]]]]:
-
+        """Forward pass of the model."""
         use_cache = use_cache if use_cache is not None else self.use_cache
         batch_size, seq_length = input_ids.shape
         if position_ids is None:
@@ -311,7 +311,7 @@ class SmolLM2_360M(nn.Module):
 # --- Tokenizer ---
 
 class SmolLM2Tokenizer:
-
+    """Tokenizer for SmolLM2-360M using SentencePiece or a rudimentary BPE."""
     def __init__(self, tokenizer_path: str = ".", special_tokens_map_path: str = ".", config_path: str = "."):
         self.tokenizer_path = tokenizer_path
         self.special_tokens_map_path = special_tokens_map_path
@@ -355,7 +355,7 @@ class SmolLM2Tokenizer:
         self.additional_special_tokens_ids = [self.token_to_id.get(token, -1) for token in self.additional_special_tokens]
 
     def update_special_tokens_from_sp(self):
-
+        """Update special token IDs from SentencePiece model, if present."""
         for token_name, token_data in self.special_tokens_map.items():
             sp_id = self.sp_model.piece_to_id(token_data['content'])
             if sp_id != self.sp_model.unk_id():
@@ -387,7 +387,7 @@ class SmolLM2Tokenizer:
 
 
     def bpe(self, token: str) -> List[str]:
-
+        """Rudimentary BPE tokenization."""
         if not self.use_sentencepiece:
             word = list(token)
             while len(word) > 1:
@@ -412,7 +412,7 @@ class SmolLM2Tokenizer:
         return [] # If SentencePiece is used, this function is not called.
 
     def encode(self, text: str, add_special_tokens: bool = True) -> List[int]:
-
+        """Encode text to token IDs."""
         if self.use_sentencepiece:
             if add_special_tokens:
                 return self.sp_model.encode(text, out_type=int) #add_bos=True, add_eos=True if needed, adjust as per model requirement
@@ -428,7 +428,7 @@ class SmolLM2Tokenizer:
         return token_ids
 
     def decode(self, token_ids: List[int]) -> str:
-
+        """Decode token IDs to text."""
         if self.use_sentencepiece:
             return self.sp_model.decode(token_ids)
         else:
@@ -439,7 +439,7 @@ class SmolLM2Tokenizer:
 # --- Inference ---
 
 def generate_text(model: SmolLM2_360M, tokenizer: SmolLM2Tokenizer, prompt: str, MAX_GENERATION_LENGTH: int = 100, device: torch.device = 'cpu') -> str:
-
+    """Generate text using greedy decoding."""
     input_ids = tokenizer.encode(prompt, add_special_tokens=True)
     input_ids = torch.tensor([input_ids], dtype=torch.long, device=device)
 
@@ -503,4 +503,5 @@ if __name__ == "__main__":
         generated_text = generate_text(model, tokenizer, user_input, MAX_GENERATION_LENGTH=MAX_GENERATION_LENGTH, device=device)
         print(f"Generated Text: {generated_text}")
         end = timed_step(start, "Prompt Generation")
+
 
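For orientation, below is a minimal usage sketch of the functions whose docstrings this commit adds. It is illustrative only and not part of the commit; the file names (config.json, model.safetensors) and tokenizer directory are assumptions about the repository layout, not taken from the diff.

# Illustrative sketch only; file paths below are assumed, not part of this commit.
import torch
from SmolLM2_360M_model_debugging import SmolLM2_360M, SmolLM2Tokenizer, generate_text

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model from its JSON config, then load safetensors weights via load_weights().
model = SmolLM2_360M(config_path="config.json").to(device)
model.load_weights("model.safetensors")
model.eval()

# The tokenizer reads its tokenizer, special-tokens-map, and config files from the given directories.
tokenizer = SmolLM2Tokenizer(tokenizer_path=".", special_tokens_map_path=".", config_path=".")

# generate_text() encodes the prompt, runs greedy decoding, and returns the decoded string.
text = generate_text(model, tokenizer, "Hello, world!", MAX_GENERATION_LENGTH=50, device=device)
print(text)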