Update V2.py
V2.py CHANGED
@@ -129,54 +129,34 @@ ds = ds.map(lambda v1, v2: ((v1, v2), tf.zeros([BATCH_SIZE], dtype=tf.float32)),
 ds = ds.prefetch(tf.data.AUTOTUNE)
 
 
-class HyperConv1D(layers.Layer):
-    def __init__(self, d_model, k=7
+class DynamicConv(layers.Layer):
+    def __init__(self, d_model, k=7):
         super().__init__()
         assert k % 2 == 1
         self.k = k
-        self.
-
-
-
-
-
-
-
-
-
-        # Hypernetwork for token-wise features
-        self.hyper = tf.keras.Sequential([
-            layers.Dense(hyper_dim, activation='gelu'),
-            layers.Dense(d_model)
-        ], name="hyper")
-
-        # Attention pooling for global context
-        self.attn_pool = layers.Dense(1)
-
-        # LayerNorm + Dropout
-        self.norm = layers.LayerNormalization()
-        self.dropout = layers.Dropout(dropout)
-        self.dense = layers.Dense(d_model)
-
-    def call(self, x, training=None):
+        self.dense = layers.Dense(d_model, activation='silu')
+        self.proj = layers.Dense(d_model)
+        self.generator = layers.Dense(k, dtype='float32')
+
+        self.ln1 = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)
+        self.ln2 = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)
+
+
+    def call(self, x):
         x_in = x
-
+        x = tf.cast(x, tf.float32)
+        x = self.ln1(x)
 
-
-
-
-        L = tf.shape(x_proj)[1]
-        D = self.d_model
-        pad = (self.k - 1) // 2
+        B = tf.shape(x)[0]
+        L = tf.shape(x)[1]
+        D = tf.shape(x)[2]
 
-
-        # 2) DynamicConv local mixing
-        # ------------------------------
-        kernels = self.kernel_generator(self.dynamic_dense(x_proj))
-        kernels = tf.cast(kernels, x_proj.dtype)
+        kernels = self.generator(self.dense(x))
         kernels = tf.nn.softmax(kernels, axis=-1)
 
-
+        pad = (self.k - 1) // 2
+        x_pad = tf.pad(x, [[0,0],[pad,pad],[0,0]])
+
         x_pad_4d = tf.expand_dims(x_pad, axis=1)
         patches = tf.image.extract_patches(
             images=x_pad_4d,
@@ -186,30 +166,15 @@ class HyperConv1D(layers.Layer):
             padding='VALID'
         )
         patches = tf.reshape(patches, [B, L, self.k, D])
+
         kernels_exp = tf.expand_dims(kernels, axis=-1)
-
-
-
-
-
-        #
-
-        scores = tf.nn.softmax(self.attn_pool(h), axis=1)   # (B, L, 1)
-        global_context = tf.reduce_sum(h * scores, axis=1)  # (B, D)
-        # token-wise concat
-        global_context_exp = tf.expand_dims(global_context, 1) * tf.ones([B, L, 1], dtype=x_proj.dtype)
-        out_local = tf.concat([out_local, global_context_exp], axis=-1)
-        out_local = self.dense(out_local)  # match dimensions
-
-        # ------------------------------
-        # 4) Residual + SiLU + LayerNorm
-        # ------------------------------
-        out = x_proj + out_local
-        out = tf.nn.silu(out)
-        out = self.norm(out)
-        out = self.dropout(out, training=training)
-
-        return tf.cast(out, x_dtype)
+        out = tf.reduce_sum(patches * kernels_exp, axis=2)
+        out = self.proj(out)
+        out = tf.nn.gelu(out)
+        out = x + self.ln2(out)
+
+        # 🔥 cast back to the original dtype
+        return tf.cast(out, x_in.dtype)
 
 
 class L2NormLayer(layers.Layer):
@@ -227,11 +192,14 @@ class SentenceEncoder(Model):
         self.embed = layers.Embedding(vocab_size, embed_dim)
         self.pos_embed = layers.Embedding(input_dim=max_len, output_dim=embed_dim)
         self.dropout = layers.Dropout(dropout_rate)
-        self.blocks = [
+        self.blocks = [DynamicConv(d_model=embed_dim, k=7) for _ in range(4)]
         self.attn_pool = layers.Dense(1)
+
         self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)
+
         self.latent = layers.Dense(latent_dim, activation=None)
         self.l2norm = L2NormLayer(axis=1)
+
         self.fc1 = layers.Dense(1152)
         self.fc2 = layers.Dense(embed_dim)
 
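For reference, the weighted-patch step added in DynamicConv.call implements a per-token dynamic convolution: each position gets its own softmax-normalized kernel of width k, and tf.image.extract_patches gathers the k-wide neighborhood so the weighted sum reduces to one broadcasted multiply. The sketch below is a minimal, self-contained reproduction of just that step on a toy tensor, checked against an explicit per-position loop; the batch/length/width values and the sizes/strides/rates arguments (which fall outside the diff hunks) are illustrative assumptions, not taken from V2.py.

import tensorflow as tf

# Toy sizes (assumptions for illustration only).
B, L, D, k = 2, 5, 4, 3
pad = (k - 1) // 2

x = tf.random.normal([B, L, D])
# One softmax-normalized kernel of width k per (batch, position), mirroring
# kernels = tf.nn.softmax(self.generator(self.dense(x)), axis=-1) in the diff.
kernels = tf.nn.softmax(tf.random.normal([B, L, k]), axis=-1)

# Patch-based path used by DynamicConv.call: pad, gather k-wide windows, weight, sum.
x_pad = tf.pad(x, [[0, 0], [pad, pad], [0, 0]])
x_pad_4d = tf.expand_dims(x_pad, axis=1)               # (B, 1, L + 2*pad, D)
patches = tf.image.extract_patches(
    images=x_pad_4d,
    sizes=[1, 1, k, 1],                                # assumed; these args are not shown in the hunks
    strides=[1, 1, 1, 1],
    rates=[1, 1, 1, 1],
    padding='VALID')                                   # (B, 1, L, k*D)
patches = tf.reshape(patches, [B, L, k, D])
out = tf.reduce_sum(patches * tf.expand_dims(kernels, -1), axis=2)   # (B, L, D)

# Reference: the same dynamic convolution written as an explicit loop over positions.
ref = tf.stack(
    [tf.einsum('bkd,bk->bd', x_pad[:, t:t + k, :], kernels[:, t, :]) for t in range(L)],
    axis=1)

print(out.shape)                                       # (2, 5, 4)
print(float(tf.reduce_max(tf.abs(out - ref))))         # ~0.0, up to float rounding

The reduction in the last broadcasted line is exactly what out = tf.reduce_sum(patches * kernels_exp, axis=2) computes in the new layer, before the proj, gelu, and residual steps that follow it in the diff.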