Clean up SuperLinear configs, delete legacy files, update model + full_shot
- config.json +0 -38
- configuration_super_linear.py +22 -10
- configuration_super_linear_base.py +0 -84
- configuration_super_linear_fs.py +0 -90
- full_shot/config.json +1 -1
- modeling_super_linear.py +2 -4
config.json
DELETED
@@ -1,38 +0,0 @@
-{
-  "_name_or_path": "super_linear",
-  "architectures": [
-    "SuperLinearForCausalLM"
-  ],
-  "auto_map": {
-    "AutoConfig": "configuration_super_linear_base.SuperLinearConfigBase",
-    "AutoModelForCausalLM": "modeling_super_linear.SuperLinearForCausalLM"
-  },
-  "auto_regressive": 1,
-  "d_model": 128,
-  "dropout": 0.0,
-  "fft_len": 5000,
-  "freeze_experts": 1,
-  "freq_experts": "mean_naive_1/4_1/6_1/7_1/8_1/12_1/14_1/16_1/21_1/24_1/28_1/30_1/32_1/36_1/42_1/48_1/52_1/56_1/60_1/72_1/84_1/90_1/96_1/120_1/144_1/168_1/180_1/224_1/252_1/288_1/336_1/365_1/504_1/672_1/1008_1/1440_1/2016_1/3600",
-  "inf_pred_len": 96,
-  "layer_type": "RLinear",
-  "linear_checkpoints_dir": "checkpoints5",
-  "linear_checkpoints_path": "/cs/azencot_fsas/MoE/",
-  "load_linear": 0,
-  "load_weights": 0,
-  "max_horizon": 96,
-  "misc_moe": 10,
-  "mlp_gating": 0,
-  "model_type": "super_linear",
-  "moe": 1,
-  "moe_n_experts": 12,
-  "moe_temp": 1,
-  "noisy_gating_std": 0.1,
-  "noisy_gating_std_decay": 1,
-  "pred_len": 96,
-  "seq_len": 512,
-  "moe_norm": 0,
-  "top_k_experts": 12,
-  "torch_dtype": "float32",
-  "transformers_version": "4.40.1",
-  "use_fft": 1
-}
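The deleted root config.json still pointed its AutoConfig entry at configuration_super_linear_base.SuperLinearConfigBase, which this commit also removes, so that file could no longer resolve as-is. If a root config is regenerated later, a minimal sketch of doing it from the consolidated class (assuming a local checkout of this repo; the output directory name is hypothetical) would be:

    # Sketch: regenerate a root config.json whose auto_map points at the consolidated class.
    from configuration_super_linear import SuperLinearConfig

    # Record this custom class so save_pretrained writes an auto_map entry for it.
    SuperLinearConfig.register_for_auto_class("AutoConfig")

    cfg = SuperLinearConfig()            # defaults from the consolidated __init__
    cfg.save_pretrained("regenerated")   # writes regenerated/config.json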
configuration_super_linear.py
CHANGED
@@ -1,22 +1,22 @@
 from typing import Optional, Tuple
 import torch, torch.nn as nn, torch.nn.functional as F
-from .configuration_super_linear_base import SuperLinearConfigBase
-
+from transformers import (
+    PretrainedConfig,
+    PreTrainedModel,
+    GenerationMixin,
+    AutoConfig,
+    AutoModelForCausalLM,
+)
+from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
 
 # 1) --------------------------------------------------------------------------
 # CONFIG
 # -----------------------------------------------------------------------------
 
 
-class SuperLinearConfig(SuperLinearConfigBase):
-    """
-    Configuration for the SuperLinear MoE time–series foundation model.
-    Only *model_type* must be unique inside transformers; the rest mirrors
-    the __init__ arguments of your original Config object.
-    """
+class SuperLinearConfig(PretrainedConfig):
 
     model_type = "super_linear"
-
     def __init__(
         self,
         seq_len=512,
@@ -39,7 +39,7 @@ class SuperLinearConfig(SuperLinearConfigBase):
         load_weights =0,
         misc_moe = 10,
         mlp_gating = 0,
-        moe_norm = 1,
+        moe_norm = 0,
         model_type= "super_linear",
         moe_temp = 1,
         noisy_gating_std = 0.1,
@@ -47,6 +47,12 @@ class SuperLinearConfig(SuperLinearConfigBase):
         torch_dtype = "float32",
         transformers_version = "4.40.1",
         use_fft = 1,
+        train_epochs = 30,
+        patience = 5,
+        lradj = "constant",
+        learning_rate = 0.05,
+        channel_ind = 0,
+        full_size = 0,
         **kwargs, # any extra CLI args
     ):
         self.seq_len = seq_len
@@ -74,4 +80,10 @@ class SuperLinearConfig(SuperLinearConfigBase):
         self.use_fft = use_fft
         self.fft_len = fft_len
         self.dropout = dropout
+        self.train_epochs = train_epochs
+        self.patience = patience
+        self.lradj = lradj
+        self.learning_rate = learning_rate
+        self.channel_ind = channel_ind
+        self.full_size = full_size
         super().__init__(**kwargs)
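With SuperLinearConfigBase and SuperLinearConfigFS folded into this single class, the training-related fields that previously lived only on the FS variant (train_epochs, patience, lradj, learning_rate, channel_ind, full_size) become ordinary constructor arguments. A minimal sketch of exercising the consolidated class, assuming only what the diff above shows:

    from configuration_super_linear import SuperLinearConfig

    # Defaults come from the __init__ signature above; any field can be overridden per run.
    cfg = SuperLinearConfig(seq_len=512, learning_rate=0.01, train_epochs=10)

    assert cfg.model_type == "super_linear"
    assert cfg.moe_norm == 0               # new default carried over from the FS variant
    print(cfg.to_dict()["lradj"])          # serialized like any other PretrainedConfig field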
configuration_super_linear_base.py
DELETED
@@ -1,84 +0,0 @@
-from typing import Optional, Tuple
-import torch, torch.nn as nn, torch.nn.functional as F
-
-from transformers import (
-    PretrainedConfig,
-    PreTrainedModel,
-    GenerationMixin,
-    AutoConfig,
-    AutoModelForCausalLM,
-)
-from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
-
-# 1) --------------------------------------------------------------------------
-# CONFIG
-# -----------------------------------------------------------------------------
-
-
-class SuperLinearConfigBase(PretrainedConfig):
-    """
-    Configuration for the SuperLinear MoE time–series foundation model.
-    Only *model_type* must be unique inside transformers; the rest mirrors
-    the __init__ arguments of your original Config object.
-    """
-
-    model_type = "super_linear"
-
-    def __init__(
-        self,
-        seq_len=512,
-        pred_len=96,
-        inf_pred_len=96,
-        max_horizon=96,
-        moe_n_experts=12,
-        top_k_experts=5,
-        moe =1,
-        freq_experts= 'mean_naive_1/4_1/6_1/7_1/8_1/12_1/14_1/16_1/21_1/24_1/28_1/30_1/32_1/36_1/42_1/48_1/52_1/56_1/60_1/72_1/84_1/90_1/96_1/120_1/144_1/168_1/180_1/224_1/252_1/288_1/336_1/365_1/504_1/672_1/1008_1/1440_1/2016_1/3600',
-        auto_regressive= 1,
-        d_model= 128,
-        dropout= 0.0,
-        fft_len= 5000,
-        freeze_experts= 1,
-        layer_type= "RLinear",
-        linear_checkpoints_dir= "checkpoints5",
-        linear_checkpoints_path= "/cs/azencot_fsas/MoE/",
-        load_linear = 0,
-        load_weights =0,
-        misc_moe = 10,
-        mlp_gating = 0,
-        moe_norm = 1,
-        model_type= "super_linear",
-        moe_temp = 1,
-        noisy_gating_std = 0.1,
-        noisy_gating_std_decay = 1,
-        torch_dtype = "float32",
-        transformers_version = "4.40.1",
-        use_fft = 1,
-        **kwargs, # any extra CLI args
-    ):
-        self.seq_len = seq_len
-        self.moe = moe
-        self.pred_len = pred_len
-        self.inf_pred_len = inf_pred_len
-        self.max_horizon = max_horizon
-        self.auto_regressive = auto_regressive
-        self.moe_n_experts = moe_n_experts
-        self.top_k_experts = top_k_experts
-        self.freq_experts = freq_experts
-        self.freeze_experts = freeze_experts
-        self.layer_type = layer_type
-        self.linear_checkpoints_path = linear_checkpoints_path
-        self.linear_checkpoints_dir = linear_checkpoints_dir
-        self.load_linear = load_linear
-        self.load_weights = load_weights
-        self.misc_moe = misc_moe
-        self.noisy_gating_std = noisy_gating_std
-        self.noisy_gating_std_decay = noisy_gating_std_decay
-        self.d_model = d_model
-        self.mlp_gating = mlp_gating
-        self.moe_norm = moe_norm
-        self.moe_temp = moe_temp
-        self.use_fft = use_fft
-        self.fft_len = fft_len
-        self.dropout = dropout
-        super().__init__(**kwargs)
configuration_super_linear_fs.py
DELETED
@@ -1,90 +0,0 @@
-from typing import Optional, Tuple
-import torch, torch.nn as nn, torch.nn.functional as F
-
-from .configuration_super_linear_base import SuperLinearConfigBase
-from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
-
-# 1) --------------------------------------------------------------------------
-# CONFIG
-# -----------------------------------------------------------------------------
-
-
-class SuperLinearConfigFS(SuperLinearConfigBase):
-    """
-    Configuration for the SuperLinear MoE time–series foundation model.
-    Only *model_type* must be unique inside transformers; the rest mirrors
-    the __init__ arguments of your original Config object.
-    """
-
-    model_type = "super_linear"
-
-    def __init__(
-        self,
-        seq_len=512,
-        pred_len=96,
-        inf_pred_len=96,
-        max_horizon=96,
-        moe_n_experts=12,
-        top_k_experts=5,
-        moe =1,
-        freq_experts= 'mean_naive_1/4_1/6_1/7_1/8_1/12_1/14_1/16_1/21_1/24_1/28_1/30_1/32_1/36_1/42_1/48_1/52_1/56_1/60_1/72_1/84_1/90_1/96_1/120_1/144_1/168_1/180_1/224_1/252_1/288_1/336_1/365_1/504_1/672_1/1008_1/1440_1/2016_1/3600',
-        auto_regressive= 1,
-        d_model= 128,
-        dropout= 0.0,
-        fft_len= 5000,
-        freeze_experts= 1,
-        layer_type= "RLinear",
-        linear_checkpoints_dir= "checkpoints5",
-        linear_checkpoints_path= "/cs/azencot_fsas/MoE/",
-        load_linear = 0,
-        load_weights =0,
-        misc_moe = 10,
-        mlp_gating = 0,
-        moe_norm = 0,
-        model_type= "super_linear",
-        moe_temp = 1,
-        noisy_gating_std = 0.1,
-        noisy_gating_std_decay = 1,
-        torch_dtype = "float32",
-        transformers_version = "4.40.1",
-        use_fft = 1,
-        train_epochs = 30,
-        patience = 5,
-        lradj = "constant",
-        learning_rate = 0.05,
-        channel_ind = 0,
-        full_size = 0,
-        **kwargs, # any extra CLI args
-    ):
-        self.seq_len = seq_len
-        self.moe = moe
-        self.pred_len = pred_len
-        self.inf_pred_len = inf_pred_len
-        self.max_horizon = max_horizon
-        self.auto_regressive = auto_regressive
-        self.moe_n_experts = moe_n_experts
-        self.top_k_experts = top_k_experts
-        self.freq_experts = freq_experts
-        self.freeze_experts = freeze_experts
-        self.layer_type = layer_type
-        self.linear_checkpoints_path = linear_checkpoints_path
-        self.linear_checkpoints_dir = linear_checkpoints_dir
-        self.load_linear = load_linear
-        self.load_weights = load_weights
-        self.misc_moe = misc_moe
-        self.noisy_gating_std = noisy_gating_std
-        self.noisy_gating_std_decay = noisy_gating_std_decay
-        self.d_model = d_model
-        self.mlp_gating = mlp_gating
-        self.moe_norm = moe_norm
-        self.moe_temp = moe_temp
-        self.use_fft = use_fft
-        self.fft_len = fft_len
-        self.dropout = dropout
-        self.train_epochs = train_epochs
-        self.patience = patience
-        self.lradj = lradj
-        self.learning_rate = learning_rate
-        self.channel_ind = channel_ind
-        self.full_size = full_size
-        super().__init__(**kwargs)
full_shot/config.json
CHANGED
@@ -4,7 +4,7 @@
     "SuperLinearForCausalLM"
   ],
   "auto_map": {
-    "AutoConfig": "…",
+    "AutoConfig": "configuration_super_linear.SuperLinearConfig",
     "AutoModelForCausalLM": "modeling_super_linear.SuperLinearForCausalLM"
   },
   "auto_regressive": 1,
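With both auto_map entries in full_shot/config.json now pointing at modules that still exist in the repo, the checkpoint resolves through Transformers' remote-code path. A minimal loading sketch; the repo id is a placeholder, and the subfolder argument assumes the full_shot directory is consumed directly from the Hub:

    from transformers import AutoConfig, AutoModelForCausalLM

    # trust_remote_code=True lets transformers import the classes named in auto_map
    cfg = AutoConfig.from_pretrained(
        "<namespace>/SuperLinear", subfolder="full_shot", trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        "<namespace>/SuperLinear", subfolder="full_shot", trust_remote_code=True
    )

    print(type(cfg).__name__)    # expected: SuperLinearConfig
    print(type(model).__name__)  # expected: SuperLinearForCausalLM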
modeling_super_linear.py
CHANGED
@@ -4,9 +4,7 @@ import torch, torch.nn as nn, torch.nn.functional as F
 
 from transformers import (PreTrainedModel,GenerationMixin,AutoConfig,AutoModelForCausalLM,)
 from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
-from .configuration_super_linear_base import SuperLinearConfigBase
 from .configuration_super_linear import SuperLinearConfig
-from .configuration_super_linear_fs import SuperLinearConfigFS
 
 from typing import Tuple, Union
 
@@ -549,9 +547,9 @@ class superLinear(nn.Module):
 "-------------------------------------------------------------------------------------------------------------------"
 class SuperLinearForCausalLM(PreTrainedModel, GenerationMixin):
 
-    config_class = …
+    config_class = SuperLinearConfig
 
-    def __init__(self, config: …):
+    def __init__(self, config: SuperLinearConfig):
         super().__init__(config)
 
         # the backbone keeps its own Config dataclass, so build one on-the-fly:
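Because the model file already imports AutoConfig and AutoModelForCausalLM, the consolidated pair can also be registered for local use, so that loading resolves without remote code. This is a sketch of one possible wiring, assuming the repo files are importable as a package; it is not necessarily what the repository itself does:

    from transformers import AutoConfig, AutoModelForCausalLM
    from configuration_super_linear import SuperLinearConfig
    from modeling_super_linear import SuperLinearForCausalLM

    # Map model_type "super_linear" to the single remaining config class,
    # and that config class to the causal-LM wrapper.
    AutoConfig.register("super_linear", SuperLinearConfig)
    AutoModelForCausalLM.register(SuperLinearConfig, SuperLinearForCausalLM)

    # Build a randomly initialized model straight from a config instance.
    model = AutoModelForCausalLM.from_config(SuperLinearConfig())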