feat: placeholders for more config
src/dalle_mini/model/configuration.py
@@ -65,6 +65,8 @@ class DalleBartConfig(PretrainedFromWandbMixin, PretrainedConfig):
         tau_init=0.05, # used only in cosine attention (Swin v2)
         use_deepnet_scaling=False, # used in Deepnet
         use_glu=False, # "GLU Variants Improve Transformer"
+        use_alibi=False, # from "Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation"
+        sink_iters=1, # used in SinkFormers
         # parameters that should not be necessary but could affect results
         force_ln_scale=True, # force scale in layernorm even when followed by dense layers
         force_final_ln_encoder=False, # force layer normalization in encoder final layer even when followed by dense layers
@@ -88,11 +90,14 @@ class DalleBartConfig(PretrainedFromWandbMixin, PretrainedConfig):
         ], "ln_positions must be 'normformer', 'swinv2' or 'deepnet'"
         if ln_positions == "deepnet":
             ln_positions = "postln"
+        assert use_alibi is False, "use_alibi is not supported yet"
         self.ln_positions = ln_positions
         self.use_cosine_attention = use_cosine_attention
         self.tau_init = tau_init
         self.use_deepnet_scaling = use_deepnet_scaling
         self.use_glu = use_glu
+        self.use_alibi = use_alibi
+        self.sink_iters = sink_iters
         self.force_ln_scale = force_ln_scale
         self.force_final_ln_encoder = force_final_ln_encoder
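Both flags land as placeholders: use_alibi is asserted off for now, and sink_iters=1 leaves attention unchanged. For context, a minimal JAX sketch of the position bias that use_alibi would eventually enable, following the geometric slope schedule from the ALiBi paper; the helper names and the symmetric abs(distance) form (suited to an encoder) are assumptions for illustration, not code in this repo:

import jax.numpy as jnp

def alibi_slopes(n_heads: int) -> jnp.ndarray:
    # Geometric slopes from the ALiBi paper: for 8 heads they are
    # 1/2, 1/4, ..., 1/256. This simple form assumes n_heads is a
    # power of two.
    start = 2.0 ** (-8.0 / n_heads)
    return start ** jnp.arange(1, n_heads + 1)

def alibi_bias(n_heads: int, q_len: int, k_len: int) -> jnp.ndarray:
    # Penalize attention logits in proportion to query-key distance;
    # the bias is added before the softmax, in place of learned or
    # sinusoidal position embeddings.
    distance = jnp.arange(k_len)[None, :] - jnp.arange(q_len)[:, None]
    slopes = alibi_slopes(n_heads)[:, None, None]
    return -slopes * jnp.abs(distance)  # shape: (n_heads, q_len, k_len)

Usage would be a one-line change at the attention logits, e.g. logits = logits + alibi_bias(heads, q_len, k_len).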
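Similarly, sink_iters is wired for the Sinkhorn normalization of SinkFormers, which alternates row and column normalizations of the attention matrix instead of applying a single softmax. A sketch under the assumption that one iteration means plain row normalization, which is why sink_iters=1 is a behavior-preserving default; the function name and iteration convention are illustrative, not dalle-mini's implementation:

import jax.numpy as jnp

def sinkhorn_attention(logits: jnp.ndarray, sink_iters: int = 1) -> jnp.ndarray:
    # Subtract the row max for numerical stability, as in softmax.
    attn = jnp.exp(logits - logits.max(axis=-1, keepdims=True))
    for i in range(sink_iters):
        # Normalize over keys (rows); with sink_iters=1 this is
        # exactly standard softmax attention.
        attn = attn / attn.sum(axis=-1, keepdims=True)
        if i < sink_iters - 1:
            # Extra iterations also normalize over queries (columns),
            # pushing attn toward a doubly-stochastic matrix.
            attn = attn / attn.sum(axis=-2, keepdims=True)
    return attn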