Clean up SuperLinear configs, delete legacy files, update model + full_shot
- config.json +0 -38
- configuration_super_linear.py +22 -10
- configuration_super_linear_base.py +0 -84
- configuration_super_linear_fs.py +0 -90
- full_shot/config.json +1 -1
- modeling_super_linear.py +2 -4
config.json
DELETED
@@ -1,38 +0,0 @@
-{
-  "_name_or_path": "super_linear",
-  "architectures": [
-    "SuperLinearForCausalLM"
-  ],
-  "auto_map": {
-    "AutoConfig": "configuration_super_linear_base.SuperLinearConfigBase",
-    "AutoModelForCausalLM": "modeling_super_linear.SuperLinearForCausalLM"
-  },
-  "auto_regressive": 1,
-  "d_model": 128,
-  "dropout": 0.0,
-  "fft_len": 5000,
-  "freeze_experts": 1,
-  "freq_experts": "mean_naive_1/4_1/6_1/7_1/8_1/12_1/14_1/16_1/21_1/24_1/28_1/30_1/32_1/36_1/42_1/48_1/52_1/56_1/60_1/72_1/84_1/90_1/96_1/120_1/144_1/168_1/180_1/224_1/252_1/288_1/336_1/365_1/504_1/672_1/1008_1/1440_1/2016_1/3600",
-  "inf_pred_len": 96,
-  "layer_type": "RLinear",
-  "linear_checkpoints_dir": "checkpoints5",
-  "linear_checkpoints_path": "/cs/azencot_fsas/MoE/",
-  "load_linear": 0,
-  "load_weights": 0,
-  "max_horizon": 96,
-  "misc_moe": 10,
-  "mlp_gating": 0,
-  "model_type": "super_linear",
-  "moe": 1,
-  "moe_n_experts": 12,
-  "moe_temp": 1,
-  "noisy_gating_std": 0.1,
-  "noisy_gating_std_decay": 1,
-  "pred_len": 96,
-  "seq_len": 512,
-  "moe_norm": 0,
-  "top_k_experts": 12,
-  "torch_dtype": "float32",
-  "transformers_version": "4.40.1",
-  "use_fft": 1
-}
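The deleted root config.json still pointed its AutoConfig entry at configuration_super_linear_base.SuperLinearConfigBase, which this commit also removes, so that file could no longer resolve as-is. If a root config is regenerated later, a minimal sketch of doing it from the consolidated class (assuming a local checkout of this repo; the output directory name is hypothetical) would be:

    # Sketch: regenerate a root config.json whose auto_map points at the consolidated class.
    from configuration_super_linear import SuperLinearConfig

    # Record this custom class so save_pretrained writes an auto_map entry for it.
    SuperLinearConfig.register_for_auto_class("AutoConfig")

    cfg = SuperLinearConfig()            # defaults from the consolidated __init__
    cfg.save_pretrained("regenerated")   # writes regenerated/config.json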
configuration_super_linear.py
CHANGED
@@ -1,22 +1,22 @@
 from typing import Optional, Tuple
 import torch, torch.nn as nn, torch.nn.functional as F
-from .configuration_super_linear_base import SuperLinearConfigBase
-
+from transformers import (
+    PretrainedConfig,
+    PreTrainedModel,
+    GenerationMixin,
+    AutoConfig,
+    AutoModelForCausalLM,
+)
+from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
 
 # 1) --------------------------------------------------------------------------
 # CONFIG
 # -----------------------------------------------------------------------------
 
 
-class SuperLinearConfig(SuperLinearConfigBase):
-    """
-    Configuration for the SuperLinear MoE time–series foundation model.
-    Only *model_type* must be unique inside transformers; the rest mirrors
-    the __init__ arguments of your original Config object.
-    """
+class SuperLinearConfig(PretrainedConfig):
 
     model_type = "super_linear"
-
     def __init__(
         self,
         seq_len=512,
@@ -39,7 +39,7 @@ class SuperLinearConfig(SuperLinearConfigBase):
         load_weights =0,
         misc_moe = 10,
         mlp_gating = 0,
-        moe_norm = 1,
+        moe_norm = 0,
         model_type= "super_linear",
         moe_temp = 1,
         noisy_gating_std = 0.1,
@@ -47,6 +47,12 @@ class SuperLinearConfig(SuperLinearConfigBase):
         torch_dtype = "float32",
         transformers_version = "4.40.1",
         use_fft = 1,
+        train_epochs = 30,
+        patience = 5,
+        lradj = "constant",
+        learning_rate = 0.05,
+        channel_ind = 0,
+        full_size = 0,
         **kwargs, # any extra CLI args
     ):
         self.seq_len = seq_len
@@ -74,4 +80,10 @@ class SuperLinearConfig(SuperLinearConfigBase):
         self.use_fft = use_fft
         self.fft_len = fft_len
         self.dropout = dropout
+        self.train_epochs = train_epochs
+        self.patience = patience
+        self.lradj = lradj
+        self.learning_rate = learning_rate
+        self.channel_ind = channel_ind
+        self.full_size = full_size
         super().__init__(**kwargs)
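With SuperLinearConfigBase and SuperLinearConfigFS folded into this single class, the training-related fields that previously lived only on the FS variant (train_epochs, patience, lradj, learning_rate, channel_ind, full_size) become ordinary constructor arguments. A minimal sketch of exercising the consolidated class, assuming only what the diff above shows:

    from configuration_super_linear import SuperLinearConfig

    # Defaults come from the __init__ signature above; any field can be overridden per run.
    cfg = SuperLinearConfig(seq_len=512, learning_rate=0.01, train_epochs=10)

    assert cfg.model_type == "super_linear"
    assert cfg.moe_norm == 0               # new default carried over from the FS variant
    print(cfg.to_dict()["lradj"])          # serialized like any other PretrainedConfig field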
configuration_super_linear_base.py
DELETED
@@ -1,84 +0,0 @@
-from typing import Optional, Tuple
-import torch, torch.nn as nn, torch.nn.functional as F
-
-from transformers import (
-    PretrainedConfig,
-    PreTrainedModel,
-    GenerationMixin,
-    AutoConfig,
-    AutoModelForCausalLM,
-)
-from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
-
-# 1) --------------------------------------------------------------------------
-# CONFIG
-# -----------------------------------------------------------------------------
-
-
-class SuperLinearConfigBase(PretrainedConfig):
-    """
-    Configuration for the SuperLinear MoE time–series foundation model.
-    Only *model_type* must be unique inside transformers; the rest mirrors
-    the __init__ arguments of your original Config object.
-    """
-
-    model_type = "super_linear"
-
-    def __init__(
-        self,
-        seq_len=512,
-        pred_len=96,
-        inf_pred_len=96,
-        max_horizon=96,
-        moe_n_experts=12,
-        top_k_experts=5,
-        moe =1,
-        freq_experts= 'mean_naive_1/4_1/6_1/7_1/8_1/12_1/14_1/16_1/21_1/24_1/28_1/30_1/32_1/36_1/42_1/48_1/52_1/56_1/60_1/72_1/84_1/90_1/96_1/120_1/144_1/168_1/180_1/224_1/252_1/288_1/336_1/365_1/504_1/672_1/1008_1/1440_1/2016_1/3600',
-        auto_regressive= 1,
-        d_model= 128,
-        dropout= 0.0,
-        fft_len= 5000,
-        freeze_experts= 1,
-        layer_type= "RLinear",
-        linear_checkpoints_dir= "checkpoints5",
-        linear_checkpoints_path= "/cs/azencot_fsas/MoE/",
-        load_linear = 0,
-        load_weights =0,
-        misc_moe = 10,
-        mlp_gating = 0,
-        moe_norm = 1,
-        model_type= "super_linear",
-        moe_temp = 1,
-        noisy_gating_std = 0.1,
-        noisy_gating_std_decay = 1,
-        torch_dtype = "float32",
-        transformers_version = "4.40.1",
-        use_fft = 1,
-        **kwargs, # any extra CLI args
-    ):
-        self.seq_len = seq_len
-        self.moe = moe
-        self.pred_len = pred_len
-        self.inf_pred_len = inf_pred_len
-        self.max_horizon = max_horizon
-        self.auto_regressive = auto_regressive
-        self.moe_n_experts = moe_n_experts
-        self.top_k_experts = top_k_experts
-        self.freq_experts = freq_experts
-        self.freeze_experts = freeze_experts
-        self.layer_type = layer_type
-        self.linear_checkpoints_path = linear_checkpoints_path
-        self.linear_checkpoints_dir = linear_checkpoints_dir
-        self.load_linear = load_linear
-        self.load_weights = load_weights
-        self.misc_moe = misc_moe
-        self.noisy_gating_std = noisy_gating_std
-        self.noisy_gating_std_decay = noisy_gating_std_decay
-        self.d_model = d_model
-        self.mlp_gating = mlp_gating
-        self.moe_norm = moe_norm
-        self.moe_temp = moe_temp
-        self.use_fft = use_fft
-        self.fft_len = fft_len
-        self.dropout = dropout
-        super().__init__(**kwargs)
configuration_super_linear_fs.py
DELETED
@@ -1,90 +0,0 @@
-from typing import Optional, Tuple
-import torch, torch.nn as nn, torch.nn.functional as F
-
-from .configuration_super_linear_base import SuperLinearConfigBase
-from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
-
-# 1) --------------------------------------------------------------------------
-# CONFIG
-# -----------------------------------------------------------------------------
-
-
-class SuperLinearConfigFS(SuperLinearConfigBase):
-    """
-    Configuration for the SuperLinear MoE time–series foundation model.
-    Only *model_type* must be unique inside transformers; the rest mirrors
-    the __init__ arguments of your original Config object.
-    """
-
-    model_type = "super_linear"
-
-    def __init__(
-        self,
-        seq_len=512,
-        pred_len=96,
-        inf_pred_len=96,
-        max_horizon=96,
-        moe_n_experts=12,
-        top_k_experts=5,
-        moe =1,
-        freq_experts= 'mean_naive_1/4_1/6_1/7_1/8_1/12_1/14_1/16_1/21_1/24_1/28_1/30_1/32_1/36_1/42_1/48_1/52_1/56_1/60_1/72_1/84_1/90_1/96_1/120_1/144_1/168_1/180_1/224_1/252_1/288_1/336_1/365_1/504_1/672_1/1008_1/1440_1/2016_1/3600',
-        auto_regressive= 1,
-        d_model= 128,
-        dropout= 0.0,
-        fft_len= 5000,
-        freeze_experts= 1,
-        layer_type= "RLinear",
-        linear_checkpoints_dir= "checkpoints5",
-        linear_checkpoints_path= "/cs/azencot_fsas/MoE/",
-        load_linear = 0,
-        load_weights =0,
-        misc_moe = 10,
-        mlp_gating = 0,
-        moe_norm = 0,
-        model_type= "super_linear",
-        moe_temp = 1,
-        noisy_gating_std = 0.1,
-        noisy_gating_std_decay = 1,
-        torch_dtype = "float32",
-        transformers_version = "4.40.1",
-        use_fft = 1,
-        train_epochs = 30,
-        patience = 5,
-        lradj = "constant",
-        learning_rate = 0.05,
-        channel_ind = 0,
-        full_size = 0,
-        **kwargs, # any extra CLI args
-    ):
-        self.seq_len = seq_len
-        self.moe = moe
-        self.pred_len = pred_len
-        self.inf_pred_len = inf_pred_len
-        self.max_horizon = max_horizon
-        self.auto_regressive = auto_regressive
-        self.moe_n_experts = moe_n_experts
-        self.top_k_experts = top_k_experts
-        self.freq_experts = freq_experts
-        self.freeze_experts = freeze_experts
-        self.layer_type = layer_type
-        self.linear_checkpoints_path = linear_checkpoints_path
-        self.linear_checkpoints_dir = linear_checkpoints_dir
-        self.load_linear = load_linear
-        self.load_weights = load_weights
-        self.misc_moe = misc_moe
-        self.noisy_gating_std = noisy_gating_std
-        self.noisy_gating_std_decay = noisy_gating_std_decay
-        self.d_model = d_model
-        self.mlp_gating = mlp_gating
-        self.moe_norm = moe_norm
-        self.moe_temp = moe_temp
-        self.use_fft = use_fft
-        self.fft_len = fft_len
-        self.dropout = dropout
-        self.train_epochs = train_epochs
-        self.patience = patience
-        self.lradj = lradj
-        self.learning_rate = learning_rate
-        self.channel_ind = channel_ind
-        self.full_size = full_size
-        super().__init__(**kwargs)
full_shot/config.json
CHANGED
@@ -4,7 +4,7 @@
     "SuperLinearForCausalLM"
   ],
   "auto_map": {
-    "AutoConfig": "…",
+    "AutoConfig": "configuration_super_linear.SuperLinearConfig",
     "AutoModelForCausalLM": "modeling_super_linear.SuperLinearForCausalLM"
   },
   "auto_regressive": 1,
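With both auto_map entries in full_shot/config.json now pointing at modules that still exist in the repo, the checkpoint resolves through Transformers' remote-code path. A minimal loading sketch; the repo id is a placeholder, and the subfolder argument assumes the full_shot directory is consumed directly from the Hub:

    from transformers import AutoConfig, AutoModelForCausalLM

    # trust_remote_code=True lets transformers import the classes named in auto_map
    cfg = AutoConfig.from_pretrained(
        "<namespace>/SuperLinear", subfolder="full_shot", trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        "<namespace>/SuperLinear", subfolder="full_shot", trust_remote_code=True
    )

    print(type(cfg).__name__)    # expected: SuperLinearConfig
    print(type(model).__name__)  # expected: SuperLinearForCausalLM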
modeling_super_linear.py
CHANGED
@@ -4,9 +4,7 @@ import torch, torch.nn as nn, torch.nn.functional as F
 
 from transformers import (PreTrainedModel,GenerationMixin,AutoConfig,AutoModelForCausalLM,)
 from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
-from .configuration_super_linear_base import SuperLinearConfigBase
 from .configuration_super_linear import SuperLinearConfig
-from .configuration_super_linear_fs import SuperLinearConfigFS
 
 from typing import Tuple, Union
 
@@ -549,9 +547,9 @@ class superLinear(nn.Module):
 "-------------------------------------------------------------------------------------------------------------------"
 class SuperLinearForCausalLM(PreTrainedModel, GenerationMixin):
 
-    config_class = …
+    config_class = SuperLinearConfig
 
-    def __init__(self, config: …):
+    def __init__(self, config: SuperLinearConfig):
         super().__init__(config)
 
         # the backbone keeps its own Config dataclass, so build one on-the-fly:
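Because the model file already imports AutoConfig and AutoModelForCausalLM, the consolidated pair can also be registered for local use, so that loading resolves without remote code. This is a sketch of one possible wiring, assuming the repo files are importable as a package; it is not necessarily what the repository itself does:

    from transformers import AutoConfig, AutoModelForCausalLM
    from configuration_super_linear import SuperLinearConfig
    from modeling_super_linear import SuperLinearForCausalLM

    # Map model_type "super_linear" to the single remaining config class,
    # and that config class to the causal-LM wrapper.
    AutoConfig.register("super_linear", SuperLinearConfig)
    AutoModelForCausalLM.register(SuperLinearConfig, SuperLinearForCausalLM)

    # Build a randomly initialized model straight from a config instance.
    model = AutoModelForCausalLM.from_config(SuperLinearConfig())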