Yuchan committed
Update Inference.py
Inference.py  +81 -119
CHANGED

@@ -77,10 +77,6 @@ def text_to_ids(text):
def ids_to_text(ids):
    return sp.decode(ids)

-# =======================
-# 3) Model layers (existing code unchanged)
-# =======================
-
class SwiGLU(layers.Layer):
    def __init__(self, d_model, d_ff):
        super().__init__()

@@ -91,139 +87,105 @@ class SwiGLU(layers.Layer):
        x_val, x_gate = tf.split(x_proj, 2, axis=-1)
        return self.out(x_val * tf.nn.silu(x_gate))
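The two context lines above are the tail of SwiGLU's forward pass: the projected tensor is split into a value half and a gate half, and the value is scaled by SiLU of the gate. A small, self-contained sketch of that gating pattern, with toy sizes and layer names of my own rather than the ones in this file:

import tensorflow as tf

# Toy SwiGLU-style gating (illustration only): project to 2*d_ff, split into
# value/gate halves, gate with SiLU, then project back to d_model.
d_model, d_ff = 8, 16
proj = tf.keras.layers.Dense(d_ff * 2)
out = tf.keras.layers.Dense(d_model)

x = tf.random.normal((2, 5, d_model))           # (batch, seq, d_model)
x_proj = proj(x)                                # (2, 5, 2*d_ff)
x_val, x_gate = tf.split(x_proj, 2, axis=-1)    # two (2, 5, d_ff) halves
y = out(x_val * tf.nn.silu(x_gate))             # gated value, back to d_model
print(y.shape)                                  # (2, 5, 8)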

-class gMLPBlock(layers.Layer):
-    def __init__(self, d_model, seq_len, dropout=0.1):
-        super().__init__()
-        self.d_model = d_model
-        self.seq_len = seq_len
-        self.norm = layers.LayerNormalization(epsilon=1e-6)
-
-        # FFN: Channel Expansion
-        # expand to d_model * 4
-        self.channel_proj = layers.Dense(d_model * 4, use_bias=True)
-        self.dropout = layers.Dropout(dropout)
-
-        # Spatial Gating Unit (SGU)
-        self.sgu_norm = layers.LayerNormalization(epsilon=1e-6)
-        self.sgu_proj = layers.Dense(seq_len, use_bias=False)
-
-        # set the output dimension to d_model * 2 (the dimension of U)
-        self.sgu_final = layers.Dense(d_model * 2, use_bias=True)
-
-        self.out_proj = layers.Dense(d_model, use_bias=True)
-
-    def call(self, x, training=False):
-        # 1. Norm and Channel Expansion
-        residual = x
-        x_norm = self.norm(x)
-        x_proj = self.channel_proj(x_norm)  # Shape: (B, L, 4*D)
-
-        # 2. Split (U and V streams)
-        u, v = tf.split(x_proj, 2, axis=-1)  # u, v Shape: (B, L, 2*D)
-
-        # 3. Spatial Gating Unit (SGU)
-        v_norm = self.sgu_norm(v)
-        v_norm_T = tf.transpose(v_norm, perm=[0, 2, 1])  # (B, 2D, L)
-
-        # 💡 token mixing happens here (Dense applied along the sequence axis)
-        v_proj = self.sgu_proj(v_norm_T)  # (B, 2D, L)
-        v_proj_T = tf.transpose(v_proj, perm=[0, 2, 1])  # (B, L, 2D)
-
-        # 4. Activation and Gate Generation
-        # standard gMLP applies GELU to U and uses V as a linear gate
-        # here, GELU is applied to U
-        u_act = tf.nn.gelu(u)
-        v_gate = self.sgu_final(v_proj_T)  # Shape: (B, L, 2*D)
-
-        # 5. Gating and Contraction
-        z = u_act * v_gate  # gating
-        z = self.dropout(z, training=training)
-        out = self.out_proj(z)  # Shape: (B, L, D)
-
-        # 6. Residual Connection
-        return residual + out
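For reference, the spatial gating unit in the removed block above mixes information across positions by applying a Dense layer along the sequence axis (after a transpose), not the channel axis, as the 💡 comment notes. A minimal, self-contained sketch of that shape flow, with made-up sizes rather than anything taken from this file:

import tensorflow as tf

# Toy illustration of SGU-style token mixing along the sequence axis.
B, L, D = 2, 6, 4
v = tf.random.normal((B, L, 2 * D))                    # the V stream, (B, L, 2D)

sgu_proj = tf.keras.layers.Dense(L, use_bias=False)    # maps the L positions to L positions
v_t = tf.transpose(v, perm=[0, 2, 1])                  # (B, 2D, L): put the sequence axis last
v_mixed = tf.transpose(sgu_proj(v_t), perm=[0, 2, 1])  # back to (B, L, 2D)
print(v_mixed.shape)                                   # (2, 6, 8): each position now mixes all positions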
-
-class CrossBlock(layers.Layer):
-    def __init__(self, clip_value=5.0, eps=1e-6):  # 💡 d_model argument added
-        super().__init__()
-        self.clip_value = clip_value
-        self.eps = eps
-        self.attn = layers.MultiHeadAttention(8, 20)
-        # 💡 fix: changed the output dimension from 1 to d_model
-    def call(self, x, z):
-        y = self.attn(x, z, z)
-        return y

class LoU(layers.Layer):
    def __init__(self, d_model, clip_value=5.0, eps=1e-6):
        super().__init__()
        self.d_model = d_model
        self.clip_value = float(clip_value)
-        self.
-        self.
        self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')

-        self.glu = SwiGLU(d_model,
-
-
-    def call(self, x, z):
        x_f32 = tf.cast(x, tf.float32)
        residual = x_f32
-

-

        out = self.norm(x_comb + residual)
-        out = self.cross(out, z)
        out = self.glu(out)
        return tf.cast(out, x.dtype)
-# =======================
-# 4) AlphaS2S model (existing code unchanged)
-# =======================

-
-
        super().__init__()
-        self.
-        self.
-
-        # the encoder/decoder token embeddings and positional embeddings all use max_len
-        self.enc_embedding = layers.Embedding(input_vocab_size, d_model)
-        self.enc_pos_embedding = layers.Embedding(max_len, d_model)
-        self.dec_embedding = layers.Embedding(target_vocab_size, d_model)
-        self.dec_pos_embedding = layers.Embedding(max_len, d_model)
-
-        # EncoderBlock and LoU keep the same structure as the existing code
-        self.enc_layers = [gMLPBlock(d_model, seq_len=max_len) for _ in range(num_layers)]
-        self.dec_layers = [LoU(d_model) for _ in range(num_layers)]
-
-        self.final_layer = layers.Dense(target_vocab_size, use_bias=False)
-
-    def call(self, inputs, training=False):
-        # enc_inputs and dec_inputs are the same sequence (unified input)
-        enc_inputs = inputs["enc_inputs"]
-        dec_inputs = inputs["dec_inputs"]
-
-        enc_pos = tf.range(tf.shape(enc_inputs)[1])[tf.newaxis, :]
-        dec_pos = tf.range(tf.shape(dec_inputs)[1])[tf.newaxis, :]
-
-        # encoder stack
-        x = self.enc_embedding(enc_inputs) + self.enc_pos_embedding(enc_pos)
-        # Note: no mask -> bi-directional (BERT-like encoder)
-        for layer in self.enc_layers: x = layer(x, training=training)
-        enc_out = x  # the encoder's final output (the decoder's 'z' input)
-
-        # decoder stack
-        y = self.dec_embedding(dec_inputs) + self.dec_pos_embedding(dec_pos)
-        # Note: LoU uses an EMA internally and plays the role of a standard cross-attention block
-        for layer in self.dec_layers: y = layer(y, enc_out, training=training)
-
-        return self.final_layer(y)


class LoU(layers.Layer):
    def __init__(self, d_model, clip_value=5.0, eps=1e-6):
        super().__init__()
        self.d_model = d_model
        self.clip_value = float(clip_value)
+        self.eps = float(eps)
+        self.Q = layers.Dense(d_model, dtype='float32')
+        self.K = layers.Dense(d_model, dtype='float32')
+        self.V = layers.Dense(d_model, dtype='float32')
        self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
+        self.norm1 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')

+        self.glu = SwiGLU(d_model, 320)
+    def call(self, x):
        x_f32 = tf.cast(x, tf.float32)
        residual = x_f32
+        x_f32 = self.norm1(x)
+
+        q = self.Q(x_f32)
+        k = self.K(x_f32)
+        V = self.V(x_f32)
+        g_q = (tf.nn.tanh(q) + 1.0) / 2.0
+        g_k = (tf.nn.tanh(k) + 1.0) / 2.0
+        score = g_q * g_k

+        score = tf.cumsum(score, axis=1)  # (B, L, D)
+
+        # 💡 modified part: normalize by the running mean of the cumulative sum up to the current token
+        seq_len = tf.shape(score)[1]
+        # expand [1, 2, 3, ..., L] across the d_model dimension
+        count_for_mean = tf.cast(tf.range(seq_len) + 1, score.dtype)
+        count_for_mean = tf.reshape(count_for_mean, (1, seq_len, 1))
+
+        # divide the cumulative sum by the number of tokens seen so far -> running-mean cumulative sum (B, L, D)
+        score_mean = score / count_for_mean
+
+        # set the normalization denominator
+        denom = tf.maximum(score_mean, self.eps)
+        score_norm = score / denom
+        # -----------------------------------------------
+
+        score_clipped = tf.clip_by_value(score_norm, -self.clip_value, self.clip_value)
+        x_comb = score_clipped * V

        out = self.norm(x_comb + residual)
        out = self.glu(out)
        return tf.cast(out, x.dtype)
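To make the added normalization easier to follow, here is a small, self-contained sketch (not part of the committed file) that reproduces the cumulative-sum and running-mean steps on a tiny (1, 4, 2) tensor of gate products, using the same eps and clip_value defaults as LoU:

import tensorflow as tf

eps, clip_value = 1e-6, 5.0
score = tf.constant([[[0.2, 0.9],
                      [0.4, 0.1],
                      [0.6, 0.5],
                      [0.8, 0.3]]])                    # (B=1, L=4, D=2) gate products g_q * g_k

score = tf.cumsum(score, axis=1)                       # running sum over the sequence axis
count = tf.reshape(tf.range(1, 5, dtype=score.dtype), (1, 4, 1))
score_mean = score / count                             # running mean up to each position
score_norm = score / tf.maximum(score_mean, eps)       # normalize by that running mean
score_clipped = tf.clip_by_value(score_norm, -clip_value, clip_value)
print(score_clipped.numpy())                           # here: the position count 1..4, capped at clip_value

When the running mean stays above eps, the division cancels the accumulated value and leaves the (clipped) position count, which is what the snippet prints.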

+
+class Lo(layers.Layer):
+    def __init__(self, d_model):
        super().__init__()
+        self.d = layers.Dense(64, activation='silu')
+        self.w = layers.Dense(d_model)
+        self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')

+    def call(self, x):
+        p = self.d(x)
+        p = self.w(p)
+        return self.norm(p) + x
+
+class Block(layers.Layer):
+    def __init__(self, d_model):
+        super().__init__()
+        self.lou = LoU(d_model)
+        self.lo = Lo(d_model)
+
+    def call(self, x):
+        x = self.lou(x)
+        x = self.lo(x)
+        return x
+
+class ReLM(tf.keras.Model):
+    def __init__(self, vocab_size, max_seq_len, d_model, n_layers, dropout_rate=0.1):
+        super().__init__()
+        self.token_embedding = layers.Embedding(vocab_size, d_model)
+        self.pos_embedding = layers.Embedding(max_seq_len, d_model)
+        self.blocks = [Block(d_model) for _ in range(n_layers)]
+        self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype="float32")
+
+    def call(self, x, training=False):
+        batch_size, seq_len = tf.shape(x)[0], tf.shape(x)[1]
+        positions = tf.range(seq_len)[tf.newaxis, :]
+        x = self.token_embedding(x) + self.pos_embedding(positions)
+        for block in self.blocks:
+            x = block(x)
+        x = self.ln_f(x)
+        embedding_matrix = tf.cast(self.token_embedding.embeddings, x.dtype)
+        logits = tf.matmul(x, embedding_matrix, transpose_b=True)
+        return tf.cast(logits, tf.float32)
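The last three added lines of call implement a weight-tied output head: instead of a separate Dense(vocab_size) projection, the hidden states are scored against the transposed token-embedding matrix. A minimal, self-contained sketch of that pattern with toy sizes (the names below are illustrative, not from the file):

import tensorflow as tf

# Toy sizes, for illustration only.
vocab, d_model, seq = 11, 8, 5
embedding = tf.keras.layers.Embedding(vocab, d_model)

tokens = tf.random.uniform((1, seq), maxval=vocab, dtype=tf.int32)
h = embedding(tokens)                                   # (1, seq, d_model) hidden states

# Tied output head: reuse the embedding matrix as the output projection.
logits = tf.matmul(h, embedding.embeddings, transpose_b=True)
print(logits.shape)                                     # (1, 5, 11): one score per vocabulary entry

Tying the input and output embeddings keeps the parameter count down, which matters here since the model is built with a single Block layer.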
+
+
+model = ReLM(
+    vocab_size=vocab_size,
+    max_seq_len=max_len,
+    d_model=256,
+    n_layers=1
+)
dummy_input = {
    "enc_inputs": tf.zeros((1, max_len), dtype=tf.int32),
    "dec_inputs": tf.zeros((1, max_len), dtype=tf.int32)