# Ragdoll/models/decoder.py
import keras
from models.attention import Attention, AttentionTrain


class Decoder(keras.layers.Layer):
    """
    Decoder layer in a Transformer model architecture.

    This layer implements the decoder block of the model: self-attention followed
    by a position-wise feed-forward network, each wrapped in a residual connection
    and layer normalization, so that the output sequence is generated from the
    previously generated tokens.

    Parameters:
        dropout_rate (float): Dropout rate applied to the outputs of each sub-layer. Default is 0.2.
        num_heads (int): Number of attention heads. Default is 32.
        head_dims (int): Dimensionality of each attention head. Default is 40.
        fc_dim_factor (int): Expansion factor for the first feed-forward layer. Default is 5.
        input_len (int): Length of the input sequence. Default is 64.
    """
    def __init__(self, dropout_rate=0.2, num_heads=32, head_dims=40, fc_dim_factor=5, input_len=64):
        """
        Initializes the Decoder layer.

        Args:
            dropout_rate (float): Dropout rate applied to the outputs of each sub-layer. Default is 0.2.
            num_heads (int): Number of attention heads. Default is 32.
            head_dims (int): Dimensionality of each attention head. Default is 40.
            fc_dim_factor (int): Expansion factor for the first feed-forward layer. Default is 5.
            input_len (int): Length of the input sequence. Default is 64.
        """
        super().__init__()

        # Layer Normalization for the attention sub-layer
        self.norm1 = keras.layers.LayerNormalization(epsilon=1e-9)
        # Layer Normalization for the feed-forward sub-layer
        self.norm2 = keras.layers.LayerNormalization(epsilon=1e-9)

        # Attention mechanism
        self.attn = AttentionTrain(head_dims=head_dims, num_heads=num_heads,
                                   dropout=dropout_rate, input_len=input_len)

        # Dense layer that expands to the feed-forward hidden dimension
        self.fc1 = keras.layers.Dense(num_heads * head_dims * fc_dim_factor, activation='gelu')
        # Dense layer that projects back to the model dimension
        self.fc2 = keras.layers.Dense(num_heads * head_dims, activation='gelu')

        # Dropout layers
        self.dropout1 = keras.layers.Dropout(dropout_rate)
        self.dropout2 = keras.layers.Dropout(dropout_rate)

        # Store every hyperparameter so the layer can be reconstructed later.
        self._config = {
            'dropout_rate': dropout_rate,
            'num_heads': num_heads,
            'head_dims': head_dims,
            'fc_dim_factor': fc_dim_factor,
            'input_len': input_len,
        }
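
    def get_config(self):
        # Hedged addition (not in the original file): a minimal get_config sketch
        # that exposes the stored hyperparameters for Keras serialization,
        # assuming `_config` holds every constructor argument.
        config = super().get_config()
        config.update(self._config)
        return config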

    def call(self, inputs):
        # Self-attention sub-layer with dropout, residual connection, and normalization
        x = self.attn(inputs)
        x = self.dropout1(x)
        out1 = self.norm1(x + inputs)

        # Position-wise feed-forward sub-layer with dropout, residual connection, and normalization
        x = self.fc2(self.fc1(out1))
        x = self.dropout2(x)
        return self.norm2(x + out1)
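

# Minimal usage sketch (illustrative, not part of the original module): run the
# decoder block on random embeddings. The input shape
# (batch, input_len, num_heads * head_dims) is inferred from the residual
# connections above, and it is assumed that AttentionTrain preserves the shape
# of its input.
if __name__ == '__main__':
    import numpy as np

    decoder = Decoder(dropout_rate=0.2, num_heads=32, head_dims=40,
                      fc_dim_factor=5, input_len=64)
    dummy = np.random.normal(size=(2, 64, 32 * 40)).astype('float32')
    out = decoder(dummy)
    print(out.shape)  # expected: (2, 64, 1280)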