Update openlem-tpu.py
openlem-tpu.py (+45 −31)
@@ -135,40 +135,48 @@ class DynamicConv(layers.Layer):
         self.k = k
         self.dense = layers.Dense(d_model, activation='gelu')
         self.proj = layers.Dense(d_model)
+        # generator should produce k weights per token; softmax in float32 for stability
         self.generator = layers.Dense(k, dtype='float32')
+
     def call(self, x):
         x_in = x
-        x = tf.cast(x, tf.float32)
-
-        B = tf.shape(x)[0]
-        L = tf.shape(x)[1]
-        D = tf.shape(x)[2]
-
-        kernels = self.generator(self.dense(x))
-        kernels = tf.nn.softmax(kernels, axis=-1)
+        x = tf.cast(x, tf.float32)  # float32 so the softmax is numerically safe

+        # padding + extract patches
         pad = (self.k - 1) // 2
-        x_pad = tf.pad(x, [[0,0],[pad,pad],[0,0]])
-
-        x_pad_4d = tf.expand_dims(x_pad, axis=1)
+        x_pad = tf.pad(x, [[0,0],[pad,pad],[0,0]])  # [B, L+2pad, D]
+        x_pad_4d = tf.expand_dims(x_pad, axis=1)    # [B, 1, L+2pad, D]
+
         patches = tf.image.extract_patches(
             images=x_pad_4d,
-            sizes=[1,1,self.k,1],
-            strides=[1,1,1,1],
-            rates=[1,1,1,1],
+            sizes=[1, 1, self.k, 1],
+            strides=[1, 1, 1, 1],
+            rates=[1, 1, 1, 1],
             padding='VALID'
-        )
+        )  # shape: [B, 1, L, k*D]
+
+        # reshape -> [B, L, k, D]
+        B = tf.shape(patches)[0]
+        L = tf.shape(patches)[2]
+        D = tf.shape(x)[2]
         patches = tf.reshape(patches, [B, L, self.k, D])

-
-
+        # generate kernels per token
+        kernels = self.generator(self.dense(x))    # [B, L, k], in float32
+        kernels = tf.nn.softmax(kernels, axis=-1)  # [B, L, k]
+
+        kernels_exp = tf.expand_dims(kernels, axis=-1)      # [B, L, k, 1]
+        out = tf.reduce_sum(patches * kernels_exp, axis=2)  # [B, L, D]
         out = self.proj(out)

-        # 🔥 cast back to the original dtype
         return tf.cast(out, x_in.dtype)

+    def compute_output_shape(self, input_shape):
+        return input_shape
+
+
 class MixerBlock(layers.Layer):
-    def __init__(self, seq_len, dim, token_mlp_dim, channel_mlp_dim, dropout=0.0):
+    def __init__(self, seq_len, dim, token_mlp_dim=None, channel_mlp_dim=None, dropout=0.0):
         super().__init__()
         self.dim = dim

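For reference, the patched `call` does three things: `tf.image.extract_patches` gathers the `k` neighbors of every position from the padded input, the generator emits a softmax-normalized kernel per token, and the weighted sum collapses the `k` axis to produce `out`. Below is a minimal standalone sketch of that shape flow, with arbitrary sizes and a random tensor standing in for `softmax(generator(dense(x)))`; the `tf.einsum` is just another spelling of the diff's `reduce_sum(patches * kernels_exp, axis=2)`:

import tensorflow as tf

B, L, D, k = 2, 7, 4, 3                             # arbitrary smoke-test sizes
x = tf.random.normal([B, L, D])

pad = (k - 1) // 2
x_pad = tf.pad(x, [[0, 0], [pad, pad], [0, 0]])     # [B, L+2pad, D]
x_pad_4d = tf.expand_dims(x_pad, axis=1)            # [B, 1, L+2pad, D]
patches = tf.image.extract_patches(
    images=x_pad_4d,
    sizes=[1, 1, k, 1],
    strides=[1, 1, 1, 1],
    rates=[1, 1, 1, 1],
    padding='VALID')                                # [B, 1, L, k*D]
patches = tf.reshape(patches, [B, L, k, D])         # k neighbors per position

# random stand-in for the learned kernels: softmax(generator(dense(x)))
kernels = tf.nn.softmax(tf.random.normal([B, L, k]), axis=-1)
out = tf.einsum('blk,blkd->bld', kernels, patches)  # weighted average over the window
assert out.shape == (B, L, D)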
@@ -176,37 +184,43 @@ class MixerBlock(layers.Layer):
         self.ln_local = layers.LayerNormalization(epsilon=1e-6)
         self.ln_channel = layers.LayerNormalization(epsilon=1e-6)

-        #
-        self.token_fc1 = layers.Dense(seq_len)
+        # NOTE: token_fc1 must output 2 * seq_len to allow split()
+        self.token_fc1 = layers.Dense(seq_len * 2)
         self.token_fc2 = layers.Dense(seq_len)

-        # Channel Mixer
+        # Channel Mixer (GLU style)
         self.ch_fc1 = layers.Dense(self.dim * 4)
         self.ch_fc2 = layers.Dense(self.dim)

-
+        # local dynamic conv
+        self.conv1 = DynamicConv(d_model=dim, k=5)
+
     def call(self, x, training=None):
-        # 1
+        # 1) Local mixing first
         y = self.ln_local(x)
         y = self.conv1(y)
         x = x + y

-        # 2
+        # 2) (Weak) Global token mixing
         y = self.ln_token(x)
-        y_t = tf.transpose(y, [0,2,1])
-        #
-
-        y_t = self.token_fc2(
-
-        y = tf.transpose(y_t, [0,2,1])
+        y_t = tf.transpose(y, perm=[0, 2, 1])    # [B, D, L]
+        y_t = self.token_fc1(y_t)                # [B, D, 2*L]
+        a, b = tf.split(y_t, 2, axis=-1)         # split on last dim
+        y_t = self.token_fc2(a * tf.nn.gelu(b))  # [B, D, L]
+        y = tf.transpose(y_t, perm=[0, 2, 1])    # [B, L, D]
         x = x + y

-        # 3
+        # 3) Channel mixer (GLU)
         y = self.ln_channel(x)
         a, b = tf.split(self.ch_fc1(y), 2, axis=-1)
         y = self.ch_fc2(a * tf.nn.gelu(b))
         x = x + y

+        return x
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
+

 class L2NormLayer(layers.Layer):
     def __init__(self, axis=1, epsilon=1e-10, **kwargs):
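The old token-mixing branch could not have executed as written (the `self.token_fc2(` call is left unclosed and the gate operands are never built), so the rewritten path is worth sanity-checking in isolation. A minimal sketch, with fresh `Dense` layers standing in for the block's `token_fc1`/`token_fc2` members and arbitrary sizes; the transposes put the sequence axis last so the `Dense` layers mix across tokens rather than channels:

import tensorflow as tf
from tensorflow.keras import layers

B, L, D = 2, 16, 8                         # arbitrary smoke-test sizes
token_fc1 = layers.Dense(L * 2)            # 2*L outputs so tf.split yields two L-wide halves
token_fc2 = layers.Dense(L)

y = tf.random.normal([B, L, D])
y_t = tf.transpose(y, perm=[0, 2, 1])      # [B, D, L]
y_t = token_fc1(y_t)                       # [B, D, 2*L]
a, b = tf.split(y_t, 2, axis=-1)           # gate halves, each [B, D, L]
y_t = token_fc2(a * tf.nn.gelu(b))         # [B, D, L]
y = tf.transpose(y_t, perm=[0, 2, 1])      # back to [B, L, D]
assert y.shape == (B, L, D)

Note that `self.ln_token` is used in `call` but declared outside this hunk; presumably it is another `LayerNormalization` defined alongside `ln_local` and `ln_channel`.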