Rocketknight1 (HF staff) committed
Commit b776073
1 parent: 9bd693a

Upload HyenaDNAForCausalLM

Files changed (1)
  1. modeling_hyena.py +22 -17
modeling_hyena.py CHANGED
@@ -19,8 +19,8 @@ def fftconv(u, k, D):
     seqlen = u.shape[-1]
     fft_size = 2 * seqlen
 
-    k_f = torch.fft.rfft(k, n=fft_size) / fft_size
-    u_f = torch.fft.rfft(u.to(dtype=k.dtype), n=fft_size)
+    k_f = torch.fft.rfft(k.to(torch.float32), n=fft_size) / fft_size
+    u_f = torch.fft.rfft(u.to(dtype=torch.float32), n=fft_size)
 
     if len(u.shape) > 3: k_f = k_f.unsqueeze(1)
     y = torch.fft.irfft(u_f * k_f, n=fft_size, norm='forward')[..., :seqlen]
@@ -60,11 +60,9 @@ class HyenaPositionalEmbedding(nn.Module):
         w = 2 * math.pi * t_rescaled / self.seq_len  # 1, L, 1
 
         f = torch.linspace(1e-4, bands - 1, bands)[None, None]
-        # Matt: This is just Euler's formula, so if complex64 is a problem it can be replaced
-        # by separate sin() and cos() calls.
-        z = torch.exp(-1j * f * w)
-        z = torch.cat([t, z.real, z.imag], dim=-1)
-        # TODO Set z's LR to lr_pos_emb
+
+        z = torch.cat([t, torch.cos(-f * w), torch.sin(-f * w)], dim=-1)
+        # The original code sets z's LR to lr_pos_emb, which is 1e-5 by default
         self.z = nn.Parameter(z, requires_grad=True)
         self.register_buffer("t", t)
 
@@ -147,7 +145,7 @@ class HyenaFilter(nn.Module):
 
     def filter(self, L, *args, **kwargs):
         z, t = self.pos_emb(L)
-        h = self.implicit_filter(z)
+        h = self.implicit_filter(z.to(dtype=self.implicit_filter[0].weight.dtype))
        h = self.modulation(t, h)
         return h
 
@@ -349,8 +347,15 @@ class HyenaDNAPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ["HyenaBlock"]
     _skip_keys_device_placement = "past_key_values"
-
-    def _init_weights(self, initializer_range=0.02):
+    _keys_to_ignore_on_load_missing = [r"freq"]  # Shared tensors that safetensors merges
+
+    def _init_weights(self, module, initializer_range=0.02):
+        if isinstance(module, nn.Linear):
+            nn.init.normal_(module.weight, std=initializer_range)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            nn.init.normal_(module.weight, std=initializer_range)
         # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
         # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
         # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
@@ -368,8 +373,8 @@ class HyenaDNAPreTrainedModel(PreTrainedModel):
 
 
 class HyenaDNAModel(HyenaDNAPreTrainedModel):
-    def __init__(self, config) -> None:
-        super().__init__(config)
+    def __init__(self, config, **kwargs) -> None:
+        super().__init__(config, **kwargs)
 
         self.backbone = HyenaLMBackbone(config)
         self.config = config
@@ -395,8 +400,8 @@ class HyenaDNAModel(HyenaDNAPreTrainedModel):
 
 class HyenaDNAForCausalLM(HyenaDNAPreTrainedModel):
 
-    def __init__(self, config):
-        super().__init__(config)
+    def __init__(self, config, **kwargs):
+        super().__init__(config, **kwargs)
         self.hyena = HyenaDNAModel(config)
         vocab_size = config.vocab_size
         if vocab_size % config.pad_vocab_size_multiple != 0:
@@ -476,9 +481,9 @@ class HyenaDNAForCausalLM(HyenaDNAPreTrainedModel):
 
 
 class HyenaDNAForSequenceClassification(HyenaDNAPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
+    def __init__(self, config, **kwargs):
+        super().__init__(config, **kwargs)
+        self.num_labels = kwargs.get("num_labels", config.num_labels)
         self.hyena = HyenaDNAModel(config)
         self.score = nn.Linear(config.d_model, self.num_labels, bias=False)
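
Aside (not part of the commit): the positional-embedding change above is exactly the substitution the removed comment anticipated, splitting the complex exponential into separate cos() and sin() calls via Euler's formula, exp(-1j*x) = cos(x) - 1j*sin(x). A minimal sanity check of that equivalence, using made-up stand-in shapes rather than the model's real band/position tensors:

import math
import torch

# Stand-in shapes only; the real module builds f from its filter bands and w from
# rescaled sequence positions.
bands, seq_len = 8, 16
f = torch.linspace(1e-4, bands - 1, bands)[None, None]     # (1, 1, bands)
t = torch.linspace(0, 1, seq_len)[None, :, None]           # (1, L, 1)
w = 2 * math.pi * t                                        # (1, L, 1)

z_old = torch.exp(-1j * f * w)                             # old path: complex64 intermediate
real_new, imag_new = torch.cos(-f * w), torch.sin(-f * w)  # new path: real-valued throughout

assert torch.allclose(z_old.real, real_new, atol=1e-5)
assert torch.allclose(z_old.imag, imag_new, atol=1e-5)

The two constructions agree to float32 precision; the benefit of the new one is that self.z stays a plain real-valued nn.Parameter, avoiding the complex64 intermediates the removed comment flagged as a potential problem.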