Yuchan committed: Update AlphaS2S.py

AlphaS2S.py (+52 -111)
CHANGED
@@ -183,136 +183,77 @@ class SwiGLU(layers.Layer):
         x_val, x_gate = tf.split(x_proj, 2, axis=-1)
         return self.out(x_val * tf.nn.silu(x_gate))
 
-class gMLPBlock(layers.Layer):
-    def __init__(self, d_model, seq_len, dropout=0.1):
-        super().__init__()
-        self.d_model = d_model
-        self.seq_len = seq_len
-        self.norm = layers.LayerNormalization(epsilon=1e-6)
-
-        # FFN: Channel Expansion
-        # Expand channels to d_model * 4
-        self.channel_proj = layers.Dense(d_model * 4, use_bias=True)
-        self.dropout = layers.Dropout(dropout)
-
-        # Spatial Gating Unit (SGU)
-        self.sgu_norm = layers.LayerNormalization(epsilon=1e-6)
-        self.sgu_proj = layers.Dense(seq_len, use_bias=False)
-
-        # Set the output dimension to d_model * 2 (the dimension of U)
-        self.sgu_final = layers.Dense(d_model * 2, use_bias=True)
-
-        self.out_proj = layers.Dense(d_model, use_bias=True)
 
-
-
-
-
-
-
-
-
-
-        # 3. Spatial Gating Unit (SGU)
-        v_norm = self.sgu_norm(v)
-        v_norm_T = tf.transpose(v_norm, perm=[0, 2, 1])  # (B, 2D, L)
-
-        # Token mixing happens here (Dense applied along the sequence axis)
-        v_proj = self.sgu_proj(v_norm_T)  # (B, 2D, L)
-        v_proj_T = tf.transpose(v_proj, perm=[0, 2, 1])  # (B, L, 2D)
-
-        # 4. Activation and Gate Generation
-        # Standard gMLP applies GELU to U and uses V as a linear gate
-        # Here, GELU is applied to U
-        u_act = tf.nn.gelu(u)
-        v_gate = self.sgu_final(v_proj_T)  # Shape: (B, L, 2*D)
-
-        # 5. Gating and Contraction
-        z = u_act * v_gate  # gating
-        z = self.dropout(z, training=training)
-        out = self.out_proj(z)  # Shape: (B, L, D)
-
-        # 6. Residual Connection
-        return residual + out
 
-class
-    def __init__(self,
         super().__init__()
-        self.
-        self.
-        self.
-
-
-
-
-
-
-
         super().__init__()
-        self.
-        self.
-        self.
-        self.norm1 = layers.LayerNormalization(epsilon=1e-
-        self.
-
-        self.
-        self.
-
-    def call(self, x,
-
-
-
-
-
-
-
-
-
-        return tf.cast(out, x.dtype)
-
-# =======================
-# 4) AlphaS2S model (existing code kept)
-# =======================
-
-class AlphaS2S(tf.keras.Model):
-    def __init__(self, num_layers, d_model, num_heads, input_vocab_size, target_vocab_size, max_len=200, dropout=0.1):
         super().__init__()
         self.max_len = max_len
         self.d_model = d_model
-
-        # Encoder and decoder token and positional embeddings all use max_len
         self.enc_embedding = layers.Embedding(input_vocab_size, d_model)
         self.enc_pos_embedding = layers.Embedding(max_len, d_model)
         self.dec_embedding = layers.Embedding(target_vocab_size, d_model)
         self.dec_pos_embedding = layers.Embedding(max_len, d_model)
-
-
-        self.
-        self.dec_layers = [LoU(d_model) for _ in range(num_layers)]
-
-        self.final_layer = layers.Dense(target_vocab_size, use_bias=False)
-
     def call(self, inputs, training=False):
-
-        enc_inputs = inputs["enc_inputs"]
         dec_inputs = inputs["dec_inputs"]
-
         enc_pos = tf.range(tf.shape(enc_inputs)[1])[tf.newaxis, :]
         dec_pos = tf.range(tf.shape(dec_inputs)[1])[tf.newaxis, :]
-
-        # Encoder stack
         x = self.enc_embedding(enc_inputs) + self.enc_pos_embedding(enc_pos)
-        # Note: no mask -> bi-directional (BERT-like encoder)
         for layer in self.enc_layers: x = layer(x, training=training)
-        enc_out = x
-
-        # Decoder stack
         y = self.dec_embedding(dec_inputs) + self.dec_pos_embedding(dec_pos)
-        for layer in self.dec_layers: y = layer(y, enc_out, training=training)
-
         return self.final_layer(y)
 
-
 # 5) Training setup and execution
 # =======================
 
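The removed gMLPBlock above only survives in part: the opening of its call() (where the normalized input is expanded and split into u and v) is not captured in this hunk. Below is a minimal, self-contained sketch of the same spatial-gating pattern; the missing steps are filled in as assumptions, not the author's actual removed code.

import tensorflow as tf
from tensorflow.keras import layers

class GMLPBlockSketch(layers.Layer):
    # Hedged reconstruction of the removed gMLP/SGU block. Steps 1-2 of call()
    # (normalization, channel expansion, u/v split) are assumptions made to
    # produce a runnable example; steps 3-6 mirror the removed lines above.
    def __init__(self, d_model, seq_len, dropout=0.1):
        super().__init__()
        self.norm = layers.LayerNormalization(epsilon=1e-6)
        self.channel_proj = layers.Dense(d_model * 4, use_bias=True)   # expand to 4*D
        self.sgu_norm = layers.LayerNormalization(epsilon=1e-6)
        self.sgu_proj = layers.Dense(seq_len, use_bias=False)          # mixes tokens along L
        self.sgu_final = layers.Dense(d_model * 2, use_bias=True)      # back to U's width (2*D)
        self.out_proj = layers.Dense(d_model, use_bias=True)
        self.dropout = layers.Dropout(dropout)

    def call(self, x, training=False):
        residual = x                                     # (B, L, D)
        h = self.channel_proj(self.norm(x))              # (B, L, 4*D)  -- assumed steps 1-2
        u, v = tf.split(h, 2, axis=-1)                   # each (B, L, 2*D)
        v_norm_T = tf.transpose(self.sgu_norm(v), perm=[0, 2, 1])          # (B, 2*D, L)
        v_proj_T = tf.transpose(self.sgu_proj(v_norm_T), perm=[0, 2, 1])   # (B, L, 2*D)
        z = tf.nn.gelu(u) * self.sgu_final(v_proj_T)     # gate U with the token-mixed V
        z = self.dropout(z, training=training)
        return residual + self.out_proj(z)               # (B, L, D)

# y = GMLPBlockSketch(d_model=160, seq_len=64)(tf.random.normal([2, 64, 160]))

The Dense(seq_len) applied after the transpose is what mixes information across token positions; in this commit that role is handed back to attention in the Transformer blocks added below.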
@@ -343,7 +284,7 @@ def create_lr_schedule(initial_lr=5e-5, decay_steps=10000, decay_rate=0.9):
 
 with strategy.scope():
     # ⚠️ Fix: use the defined vocab_size instead of chat_vocab_size
-    chat_model =
                              input_vocab_size=vocab_size, target_vocab_size=vocab_size, max_len=max_len)
 
     dummy_input = {
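The hunk header above names create_lr_schedule(initial_lr=5e-5, decay_steps=10000, decay_rate=0.9), whose body is unchanged by this commit and not shown. A plausible sketch, assuming it simply wraps Keras ExponentialDecay (the staircase flag is likewise an assumption):

import tensorflow as tf

def create_lr_schedule(initial_lr=5e-5, decay_steps=10000, decay_rate=0.9):
    # Assumed body: exponential learning-rate decay with the given arguments.
    return tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=initial_lr,   # starting learning rate
        decay_steps=decay_steps,            # steps between decay applications
        decay_rate=decay_rate,              # multiplicative decay factor
        staircase=True)                     # assumption: discrete (staircase) decay

# optimizer = tf.keras.optimizers.Adam(learning_rate=create_lr_schedule())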
@@ -183,136 +183,77 @@ class SwiGLU(layers.Layer):
         x_val, x_gate = tf.split(x_proj, 2, axis=-1)
         return self.out(x_val * tf.nn.silu(x_gate))
 
 
+class SwiGLU(layers.Layer):
+    def __init__(self, d_model, d_ff):
+        super().__init__()
+        self.proj = layers.Dense(d_ff*2)
+        self.out = layers.Dense(d_model)
+    def call(self, x):
+        x_proj = self.proj(x)
+        x_val, x_gate = tf.split(x_proj, 2, axis=-1)
+        return self.out(x_val * tf.nn.silu(x_gate))
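The added SwiGLU projects to 2*d_ff channels, splits the result into a value half and a gate half, multiplies the value by SiLU(gate), and maps back to d_model. A quick shape check, assuming the class above is in scope (the sizes are illustrative, not the script's settings):

import tensorflow as tf
from tensorflow.keras import layers

swiglu = SwiGLU(d_model=160, d_ff=640)        # proj -> 1280 channels, split into value/gate
x = tf.random.normal([2, 16, 160])            # (batch, seq_len, d_model)
y = swiglu(x)
print(y.shape)                                # (2, 16, 160): gated value mapped back to d_model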
 
+class EncoderBlock(layers.Layer):
+    def __init__(self, d_model, num_heads, dff, dropout=0.1):
         super().__init__()
+        self.mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
+        self.ffn = SwiGLU(d_model, dff)
+        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
+        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
+        self.dropout1 = layers.Dropout(dropout)
+        self.dropout2 = layers.Dropout(dropout)
+    def call(self, x, mask=None, training=False):
+        attn_out = self.dropout1(self.mha(x, x, x, attention_mask=mask), training=training)
+        out1 = self.norm1(x + attn_out)
+        ffn_out = self.dropout2(self.ffn(out1), training=training)
+        return self.norm2(out1 + ffn_out)
+
+class DecoderBlock(layers.Layer):
+    def __init__(self, d_model, num_heads, dff, dropout=0.1):
         super().__init__()
+        self.self_mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
+        self.cross_mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
+        self.ffn = SwiGLU(d_model, dff)
+        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
+        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
+        self.norm3 = layers.LayerNormalization(epsilon=1e-6)
+        self.dropout1 = layers.Dropout(dropout)
+        self.dropout2 = layers.Dropout(dropout)
+        self.dropout3 = layers.Dropout(dropout)
+    def call(self, x, enc_out, training=False):
+        attn1 = self.dropout1(self.self_mha(x, x, x, use_causal_mask=True), training=training)
+        out1 = self.norm1(x + attn1)
+        attn2 = self.dropout2(self.cross_mha(out1, enc_out, enc_out), training=training)
+        out2 = self.norm2(out1 + attn2)
+        ffn_out = self.dropout3(self.ffn(out2), training=training)
+        return self.norm3(out2 + ffn_out)
+
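The added DecoderBlock relies on use_causal_mask=True in Keras MultiHeadAttention for autoregressive masking, while EncoderBlock takes an optional padding mask. A small wiring check with illustrative shapes, assuming the two classes above are in scope:

import tensorflow as tf

enc_block = EncoderBlock(d_model=160, num_heads=8, dff=640)   # sizes are illustrative
dec_block = DecoderBlock(d_model=160, num_heads=8, dff=640)

src = tf.random.normal([2, 20, 160])          # (batch, src_len, d_model)
tgt = tf.random.normal([2, 12, 160])          # (batch, tgt_len, d_model)

memory = enc_block(src, training=False)       # bidirectional self-attention + SwiGLU FFN
out = dec_block(tgt, memory, training=False)  # causal self-attention, then cross-attention
print(memory.shape, out.shape)                # (2, 20, 160) (2, 12, 160)

Note that key_dim=d_model gives every head a full-width projection; the more common setting is key_dim=d_model // num_heads, which keeps the attention parameter count in line with a standard Transformer layer.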
+class Transformer(tf.keras.Model):
+    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, max_len=100, dropout=0.1):
         super().__init__()
         self.max_len = max_len
         self.d_model = d_model
         self.enc_embedding = layers.Embedding(input_vocab_size, d_model)
         self.enc_pos_embedding = layers.Embedding(max_len, d_model)
         self.dec_embedding = layers.Embedding(target_vocab_size, d_model)
         self.dec_pos_embedding = layers.Embedding(max_len, d_model)
+        self.enc_layers = [EncoderBlock(d_model, num_heads, dff, dropout) for _ in range(num_layers)]
+        self.dec_layers = [DecoderBlock(d_model, num_heads, dff, dropout) for _ in range(num_layers)]
+        self.final_layer = layers.Dense(target_vocab_size)
     def call(self, inputs, training=False):
+        enc_inputs = inputs["enc_inputs"]
         dec_inputs = inputs["dec_inputs"]
         enc_pos = tf.range(tf.shape(enc_inputs)[1])[tf.newaxis, :]
         dec_pos = tf.range(tf.shape(dec_inputs)[1])[tf.newaxis, :]
         x = self.enc_embedding(enc_inputs) + self.enc_pos_embedding(enc_pos)
         for layer in self.enc_layers: x = layer(x, training=training)
+        enc_out = x
         y = self.dec_embedding(dec_inputs) + self.dec_pos_embedding(dec_pos)
+        for layer in self.dec_layers: y = layer(y, enc_out, training=training)
         return self.final_layer(y)
 
+
 # 5) Training setup and execution
 # =======================
 
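Transformer.call builds position ids with tf.range(seq_len)[tf.newaxis, :], a (1, L) tensor; the positional Embedding lookup then yields (1, L, d_model), which broadcasts against the (B, L, d_model) token embeddings. Sequences longer than max_len would index past the positional table. A small illustration of the broadcast (sizes are illustrative):

import tensorflow as tf
from tensorflow.keras import layers

d_model, max_len = 160, 100
tok_emb = layers.Embedding(1000, d_model)
pos_emb = layers.Embedding(max_len, d_model)

tokens = tf.random.uniform([4, 30], maxval=1000, dtype=tf.int32)   # (B, L) token ids
pos = tf.range(tf.shape(tokens)[1])[tf.newaxis, :]                 # (1, L) position ids

x = tok_emb(tokens) + pos_emb(pos)   # (4, 30, 160) + (1, 30, 160) -> broadcasts to (4, 30, 160)
print(x.shape)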
@@ -343,7 +284,7 @@ def create_lr_schedule(initial_lr=5e-5, decay_steps=10000, decay_rate=0.9):
 
 with strategy.scope():
     # ⚠️ Fix: use the defined vocab_size instead of chat_vocab_size
+    chat_model = Transformer(num_layers=4, d_model=160, num_heads=8,
                              input_vocab_size=vocab_size, target_vocab_size=vocab_size, max_len=max_len)
 
     dummy_input = {
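The dummy_input literal is cut off in this view; per Transformer.call, its keys are "enc_inputs" and "dec_inputs". Note also that the new __init__ has no default for dff, so the instantiation above still needs a dff argument. A hedged sketch of building the model with a dummy forward pass, assuming vocab_size and max_len are defined earlier in the script (the dff value and shapes below are assumptions, not the file's actual values):

import tensorflow as tf

# Instantiate the model; dff=640 is an assumed value since the diff omits it.
chat_model = Transformer(num_layers=4, d_model=160, num_heads=8, dff=640,
                         input_vocab_size=vocab_size, target_vocab_size=vocab_size,
                         max_len=max_len)

dummy_input = {
    "enc_inputs": tf.zeros([1, max_len], dtype=tf.int32),   # hypothetical dummy batch
    "dec_inputs": tf.zeros([1, max_len], dtype=tf.int32),
}
_ = chat_model(dummy_input, training=False)   # one forward pass builds all weights
chat_model.summary()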