Rzoro committed
Commit 0f53151
1 Parent(s): e20e87d

Add trained transformer model

__init__.py ADDED
File without changes
main.py ADDED
@@ -0,0 +1,34 @@
+ # torch packages
+ import torch
+ from model.transformer import Transformer
+ import json
+
+ if __name__ == "__main__":
+     """
+     The following parameters are for the Multi30K dataset
+     """
+     # Load config containing model input parameters
+     with open('params.json') as json_data:
+         config = json.load(json_data)
+     print(config)
+
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     # Instantiate model
+     model = Transformer(
+         config["dk"],
+         config["dv"],
+         config["h"],
+         config["src_vocab_size"],
+         config["target_vocab_size"],
+         config["num_encoders"],
+         config["num_decoders"],
+         config["src_pad_idx"],
+         config["target_pad_idx"],
+         config["dim_multiplier"],
+         config["pdropout"],
+         device=device)
+     # Load model weights
+     model.load_state_dict(torch.load('pytorch_transformer_model.pt',
+                                      map_location=device))
+     print(model)
+
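A quick way to sanity-check the restored weights is to push a batch of dummy token ids through the model and confirm the logits shape. The snippet below is only a sketch and is not part of the committed main.py; the dummy ids, the .to(device)/.eval() calls, and the batch sizes are assumptions, since the real tokenization pipeline (vocab.pt) is loaded elsewhere.

# Hypothetical smoke test, appended after the load above (not in this commit).
model.to(device)
model.eval()
with torch.no_grad():
    src = torch.randint(0, config["src_vocab_size"], (2, 10), device=device)    # (batch, src_len)
    trg = torch.randint(0, config["target_vocab_size"], (2, 9), device=device)  # (batch, trg_len)
    logits = model(src, trg)
    print(logits.shape)  # expected: torch.Size([2, 9, target_vocab_size])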
model/__init__.py ADDED
File without changes
model/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (169 Bytes)
model/__pycache__/decoder.cpython-310.pyc ADDED
Binary file (3.65 kB)
model/__pycache__/encoder.cpython-310.pyc ADDED
Binary file (2.76 kB)
model/__pycache__/sublayers.cpython-310.pyc ADDED
Binary file (6.17 kB)
model/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (4.4 kB)
model/decoder.py ADDED
@@ -0,0 +1,135 @@
+ import math
+ import copy
+ import time
+ import random
+ import spacy
+ import numpy as np
+ import os
+
+ # torch packages
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch import Tensor
+ import torch.optim as optim
+
+ from model.sublayers import (
+     MultiHeadAttention,
+     PositionalEncoding,
+     PositionwiseFeedForward,
+     Embedding)
+
+
+ class DecoderLayer(nn.Module):
+     def __init__(
+             self,
+             dk,
+             dv,
+             h,
+             dim_multiplier=4,
+             pdropout=0.1):
+         super().__init__()
+
+         # Reference page 5 chapter 3.2.2 Multi-head attention
+         dmodel = dk * h
+         # Reference page 5 chapter 3.3 position-wise FeedForward
+         dff = dmodel * dim_multiplier
+
+         # Masked multi-head attention
+         self.masked_attention = MultiHeadAttention(dk, dv, h, pdropout)
+         self.masked_attn_norm = nn.LayerNorm(dmodel)
+
+         # Multi-head attention
+         self.attention = MultiHeadAttention(dk, dv, h, pdropout)
+         self.attn_norm = nn.LayerNorm(dmodel)
+
+         # Position-wise feed-forward network
+         self.ff = PositionwiseFeedForward(dmodel, dff, pdropout=pdropout)
+         self.ff_norm = nn.LayerNorm(dmodel)
+
+         self.dropout = nn.Dropout(p=pdropout)
+
+     def forward(self,
+                 trg: Tensor,
+                 src: Tensor,
+                 trg_mask: Tensor,
+                 src_mask: Tensor):
+         """
+         Args:
+             trg: embedded sequences (batch_size, trg_seq_length, d_model)
+             src: embedded sequences (batch_size, src_seq_length, d_model)
+             trg_mask: mask for the sequences (batch_size, 1, trg_seq_length, trg_seq_length)
+             src_mask: mask for the sequences (batch_size, 1, 1, src_seq_length)
+
+         Returns:
+             trg: sequences after attention (batch_size, trg_seq_length, d_model)
+             attn_probs: cross-attention softmax scores (batch_size, n_heads, trg_seq_length, src_seq_length)
+         """
+         _trg, attn_probs = self.masked_attention(
+             query=trg,
+             key=trg,
+             val=trg,
+             mask=trg_mask)
+
+         # Residual connection between input and sublayer output; details: page 7, chapter 5.4 "Regularization".
+         # As in the paper, dropout is applied to the sublayer output before the add & norm.
+         trg = self.masked_attn_norm(trg + self.dropout(_trg))
+
+         # Inputs to the decoder cross-attention are as follows:
+         # query = output of the previous decoder sublayer
+         # key and val = output of the encoder
+         # mask = src_mask
+         # Reference: page 5 chapter 3.2.3 point 1
+         _trg, attn_probs = self.attention(
+             query=trg,
+             key=src,
+             val=src,
+             mask=src_mask)
+         trg = self.attn_norm(trg + self.dropout(_trg))
+
+         # Position-wise feed-forward network
+         _trg = self.ff(trg)
+         # Perform add & norm again
+         trg = self.ff_norm(trg + self.dropout(_trg))
+         return trg, attn_probs
+
+
+ class Decoder(nn.Module):
+     def __init__(
+             self,
+             dk,
+             dv,
+             h,
+             num_decoders,
+             dim_multiplier=4,
+             pdropout=0.1):
+         super().__init__()
+         self.decoder_layers = nn.ModuleList([
+             DecoderLayer(dk,
+                          dv,
+                          h,
+                          dim_multiplier,
+                          pdropout) for _ in range(num_decoders)
+         ])
+
+     def forward(self, target_inputs, src_inputs, target_mask, src_mask):
+         """
+         Input from the Embedding layer:
+             target_inputs = embedded sequences (batch_size, trg_seq_length, d_model)
+             src_inputs = embedded sequences (batch_size, src_seq_length, d_model)
+             target_mask = mask for the sequences (batch_size, 1, trg_seq_length, trg_seq_length)
+             src_mask = mask for the sequences (batch_size, 1, 1, src_seq_length)
+         """
+         target_representation = target_inputs
+
+         # Forward pass through the decoder stack
+         for layer in self.decoder_layers:
+             target_representation, attn_probs = layer(
+                 target_representation,
+                 src_inputs,
+                 target_mask,
+                 src_mask)
+         self.attn_probs = attn_probs
+         return target_representation
+
+
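As a rough sanity check, the decoder stack can be exercised on its own with random embedded inputs. The sizes below follow params.json (dk = dv = 32, h = 8, so dmodel = 256); the snippet is an illustrative sketch only, not part of the commit.

# Hypothetical shape check for the Decoder stack (not in this commit).
import torch
from model.decoder import Decoder

dec = Decoder(dk=32, dv=32, h=8, num_decoders=3)   # dmodel = 32 * 8 = 256
trg = torch.rand(2, 9, 256)      # embedded target batch  (B, trg_seq_length, dmodel)
src = torch.rand(2, 10, 256)     # encoder output         (B, src_seq_length, dmodel)
out = dec(trg, src, None, None)  # masks omitted for this quick check
print(out.shape)                 # torch.Size([2, 9, 256])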
model/encoder.py ADDED
@@ -0,0 +1,87 @@
+ import math
+ import copy
+ import time
+ import random
+ import spacy
+ import numpy as np
+ import os
+
+ # torch packages
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch import Tensor
+ import torch.optim as optim
+
+ from model.sublayers import (
+     MultiHeadAttention,
+     PositionalEncoding,
+     PositionwiseFeedForward,
+     Embedding)
+
+
+ class EncoderLayer(nn.Module):
+     """
+     This building block of the encoder consists of the following:
+     1. Multi-head attention
+     2. Sublayer logic (residual connection + layer norm)
+     3. Position-wise feed-forward network
+     """
+     def __init__(self, dk, dv, h, dim_multiplier=4, pdropout=0.1):
+         super().__init__()
+         self.attention = MultiHeadAttention(dk, dv, h, pdropout)
+         # Reference page 5 chapter 3.2.2 Multi-head attention
+         dmodel = dk * h
+         # Reference page 5 chapter 3.3 position-wise FeedForward
+         dff = dmodel * dim_multiplier
+         self.attn_norm = nn.LayerNorm(dmodel)
+         self.ff = PositionwiseFeedForward(dmodel, dff, pdropout=pdropout)
+         self.ff_norm = nn.LayerNorm(dmodel)
+
+         self.dropout = nn.Dropout(p=pdropout)
+
+     def forward(self, src_inputs, src_mask=None):
+         """
+         Forward pass as per page 3 chapter 3.1
+         """
+         mha_out, attention_wts = self.attention(
+             query=src_inputs,
+             key=src_inputs,
+             val=src_inputs,
+             mask=src_mask)
+
+         # Residual connection between input and sublayer output; details: page 7, chapter 5.4 "Regularization".
+         # As in the paper, dropout is applied to the sublayer output before the add & norm.
+         intermediate_out = self.attn_norm(src_inputs + self.dropout(mha_out))
+
+         pff_out = self.ff(intermediate_out)
+
+         # Perform add & norm again
+         out = self.ff_norm(intermediate_out + self.dropout(pff_out))
+         return out, attention_wts
+
+
+ class Encoder(nn.Module):
+     def __init__(self, dk, dv, h, num_encoders, dim_multiplier=4, pdropout=0.1):
+         super().__init__()
+         self.encoder_layers = nn.ModuleList([
+             EncoderLayer(dk,
+                          dv,
+                          h,
+                          dim_multiplier,
+                          pdropout) for _ in range(num_encoders)
+         ])
+
+     def forward(self, src_inputs, src_mask=None):
+         """
+         Input from the Embedding layer:
+             src_inputs = (B - batch size, S/T - max token sequence length, D - model dimension)
+         """
+         src_representation = src_inputs
+
+         # Forward pass through the encoder stack
+         for enc in self.encoder_layers:
+             src_representation, attn_probs = enc(src_representation, src_mask)
+
+         self.attn_probs = attn_probs
+         return src_representation
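The encoder stack can be shape-checked in isolation in the same way; again the numbers mirror params.json, and the snippet is a sketch rather than committed code.

# Hypothetical shape check for the Encoder stack (not in this commit).
import torch
from model.encoder import Encoder

enc = Encoder(dk=32, dv=32, h=8, num_encoders=3)   # dmodel = 32 * 8 = 256
x = torch.rand(2, 10, 256)   # already-embedded source batch (B, src_seq_length, dmodel)
out = enc(x)                 # src_mask defaults to None
print(out.shape)             # torch.Size([2, 10, 256])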
model/sublayers.py ADDED
@@ -0,0 +1,194 @@
+ # importing required libraries
+ import math
+ import copy
+ import time
+ import random
+ import spacy
+ import numpy as np
+ import os
+
+ # torch packages
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch import Tensor
+ import torch.optim as optim
+
+ class MultiHeadAttention(nn.Module):
+     """
+     Refer to the following blog for an in-depth walk-through of the transformer and MHA:
+     https://medium.com/@hunter-j-phillips/multi-head-attention-7924371d477a
+
+     Here we club all the per-head linear layers together, duplicate the inputs and
+     then perform the matrix multiplications.
+     """
+     def __init__(self, dk, dv, h, pdropout=0.1):
+         """
+         Input Args:
+
+             dk(int): key dimension, used for generating the key weight matrix
+             dv(int): value dimension, used for generating the value weight matrix
+             h(int) : number of heads in MHA
+         """
+         super().__init__()
+         assert dk == dv
+         self.dk = dk
+         self.dv = dv
+         self.h = h
+         self.dmodel = self.dk * self.h  # model dimension
+
+         # Keep the projections as nn.Linear modules so their parameters are tracked
+         # WQ, WK, WV -> fused linear weights covering all heads
+         self.WQ = nn.Linear(self.dmodel, self.dmodel)  # shape -> (dmodel, dmodel)
+         self.WK = nn.Linear(self.dmodel, self.dmodel)  # shape -> (dmodel, dmodel)
+         self.WV = nn.Linear(self.dmodel, self.dmodel)  # shape -> (dmodel, dmodel)
+         # Output weights
+         self.WO = nn.Linear(self.h * self.dv, self.dmodel)  # shape -> (dmodel, dmodel)
+         self.softmax = nn.Softmax(dim=-1)
+         self.dropout = nn.Dropout(p=pdropout)
+
+     def forward(self, query, key, val, mask=None):
+         """
+         Forward pass for MHA.
+
+         X has a size of (batch_size, seq_length, d_model);
+         Wq, Wk, and Wv have a size of (d_model, d_model).
+
+         Performs scaled dot-product attention across all heads.
+
+         Notation: B - batch size, S/T - max src/trg token-sequence length
+             query shape = (B, S, dmodel)
+             key shape = (B, S, dmodel)
+             val shape = (B, S, dmodel)
+         """
+         # Project the queries, keys and values
+         Q = self.WQ(query)  # shape -> (B, S, dmodel)
+         K = self.WK(key)    # shape -> (B, S, dmodel)
+         V = self.WV(val)    # shape -> (B, S, dmodel)
+
+         # Separate the last dimension into number of heads and dk
+         batch_size = Q.size(0)
+         Q = Q.view(batch_size, -1, self.h, self.dk)  # shape -> (B, S, h, dk)
+         K = K.view(batch_size, -1, self.h, self.dk)  # shape -> (B, S, h, dk)
+         V = V.view(batch_size, -1, self.h, self.dk)  # shape -> (B, S, h, dk)
+
+         # each sequence is split across n_heads, with each head receiving seq_length tokens
+         # with d_key elements in each token instead of d_model.
+         Q = Q.permute(0, 2, 1, 3)  # shape -> (B, h, S, dk)
+         K = K.permute(0, 2, 1, 3)  # shape -> (B, h, S, dk)
+         V = V.permute(0, 2, 1, 3)  # shape -> (B, h, S, dk)
+
+         # Scaled dot product of Q and K
+         scaled_dot_product = torch.matmul(Q, K.permute(0, 1, 3, 2)) / math.sqrt(self.dk)
+
+         # Fill those positions of the product with -1e10 where the mask is 0
+         if mask is not None:
+             scaled_dot_product = scaled_dot_product.masked_fill(mask == 0, -1e10)
+
+         attn_probs = self.softmax(scaled_dot_product)
+
+         # Weight the values per head
+         head = torch.matmul(self.dropout(attn_probs), V)  # shape -> (B, h, S, S) * (B, h, S, dk) = (B, h, S, dk)
+         # Prepare the heads to pass them through the output linear layer
+         head = head.permute(0, 2, 1, 3).contiguous()  # shape -> (B, S, h, dk)
+         # Concatenate the heads together
+         head = head.view(batch_size, -1, self.h * self.dk)  # shape -> (B, S, (h*dk = dmodel))
+         # Pass through the output layer
+         token_representation = self.WO(head)
+         return token_representation, attn_probs
+
+
+ class Embedding(nn.Module):
+     """
+     Embedding lookup table which is used by the positional
+     encoding block.
+     The embedding lookup table is shared across input and output.
+     """
+     def __init__(self, vocab_size, dmodel):
+         """
+         The embedding lookup needs a vocab size and a model
+         dimension size to create the lookup matrix.
+         """
+         super().__init__()
+         self.embedding_lookup = nn.Embedding(vocab_size, dmodel)
+         self.vocab_size = vocab_size
+         self.dmodel = dmodel
+
+     def forward(self, token_ids):
+         """
+         For the given token ids, look up the embedding vectors.
+
+         As per the paper, we also multiply the embedding vector by sqrt(dmodel).
+         """
+         assert token_ids.ndim == 2, \
+             f'Expected: (batch size, max token sequence length), got {token_ids.shape}'
+
+         embedding_vector = self.embedding_lookup(token_ids)
+
+         return embedding_vector * math.sqrt(self.dmodel)
+
+
+ class PositionalEncoding(nn.Module):
+     def __init__(self, dmodel, max_seq_length=5000, pdropout=0.1):
+         """
+         dmodel(int): model dimensions
+         max_seq_length(int): maximum input sequence length
+         pdropout(float): dropout probability
+         """
+         super().__init__()
+         self.dropout = nn.Dropout(p=pdropout)
+
+         # Calculate frequencies
+         position_ids = torch.arange(0, max_seq_length).unsqueeze(1)
+         # The negative sign appears because the exponent is inverted when positions and frequencies are multiplied
+         frequencies = torch.pow(10000, -torch.arange(0, dmodel, 2, dtype=torch.float) / dmodel)
+
+         # Create the positional encoding table
+         positional_encoding_table = torch.zeros(max_seq_length, dmodel)
+         # Fill even entries with sine and odd entries with cosine
+         positional_encoding_table[:, 0::2] = torch.sin(position_ids * frequencies)
+         positional_encoding_table[:, 1::2] = torch.cos(position_ids * frequencies)
+
+         # Register the positional encoding table in the state_dict; it is not included
+         # in named parameters because it is not trainable
+         self.register_buffer("positional_encoding_table", positional_encoding_table)
+
+     def forward(self, embeddings_batch):
+         """
+         embeddings_batch shape = (batch size, seq_length, dmodel)
+         positional_encoding_table shape = (max_seq_length, dmodel)
+         """
+         assert embeddings_batch.ndim == 3, \
+             f"Embeddings batch should have 3 dimensions but got {embeddings_batch.ndim}"
+         assert embeddings_batch.size()[-1] == self.positional_encoding_table.size()[-1], \
+             f"Embedding batch and positional_encoding_table should have matching last dimensions, got {embeddings_batch.shape[-1]} and {self.positional_encoding_table.shape[-1]}"
+
+         # Get encodings for the given input sequence length
+         pos_encodings = self.positional_encoding_table[:embeddings_batch.shape[1]]  # Use only seq_length rows out of max_seq_length
+
+         # Final output
+         out = embeddings_batch + pos_encodings
+         out = self.dropout(out)
+         return out
+
+
+ class PositionwiseFeedForward(nn.Module):
+     def __init__(self, dmodel, dff, pdropout=0.1):
+         super().__init__()
+
+         self.dropout = nn.Dropout(p=pdropout)
+
+         self.W1 = nn.Linear(dmodel, dff)  # Intermediate layer
+         self.W2 = nn.Linear(dff, dmodel)  # Output layer
+
+         self.relu = nn.ReLU()
+
+     def forward(self, x):
+         """
+         Perform the feed-forward calculation.
+
+         x shape = (B - batch size, S/T - max token sequence length, D - model dimension).
+         """
+         out = self.W2(self.relu(self.dropout(self.W1(x))))
+         return out
+
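Since MultiHeadAttention is the block everything else is built around, a small shape walk-through may help. The dimensions follow params.json; the snippet is a hedged sketch, not part of the commit.

# Hypothetical shape walk-through for MultiHeadAttention (not in this commit).
import torch
from model.sublayers import MultiHeadAttention

mha = MultiHeadAttention(dk=32, dv=32, h=8)        # dmodel = 32 * 8 = 256
x = torch.rand(2, 10, 256)                         # (B, S, dmodel)
mask = torch.ones(2, 1, 1, 10, dtype=torch.bool)   # attend to every position
out, attn_probs = mha(query=x, key=x, val=x, mask=mask)
print(out.shape)         # torch.Size([2, 10, 256])
print(attn_probs.shape)  # torch.Size([2, 8, 10, 10]) -> (B, h, S, S)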
model/transformer.py ADDED
@@ -0,0 +1,207 @@
+ import math
+ import copy
+ import time
+ import random
+ import spacy
+ import numpy as np
+ import os
+
+ # torch packages
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch import Tensor
+ import torch.optim as optim
+
+ from model.sublayers import (
+     MultiHeadAttention,
+     PositionalEncoding,
+     PositionwiseFeedForward,
+     Embedding)
+
+ from model.encoder import Encoder
+ from model.decoder import Decoder
+
+
+ class Transformer(nn.Module):
+     def __init__(self,
+                  dk,
+                  dv,
+                  h,
+                  src_vocab_size,
+                  target_vocab_size,
+                  num_encoders,
+                  num_decoders,
+                  src_pad_idx,
+                  target_pad_idx,
+                  dim_multiplier=4,
+                  pdropout=0.1,
+                  device="cpu"
+                  ):
+         super().__init__()
+
+         # Reference page 5 chapter 3.2.2 Multi-head attention
+         dmodel = dk * h
+         # Modules required to build the Encoder
+         self.src_embeddings = Embedding(src_vocab_size, dmodel)
+         self.src_positional_encoding = PositionalEncoding(
+             dmodel,
+             max_seq_length=src_vocab_size,
+             pdropout=pdropout
+         )
+         self.encoder = Encoder(
+             dk,
+             dv,
+             h,
+             num_encoders,
+             dim_multiplier=dim_multiplier,
+             pdropout=pdropout)
+
+         # Modules required to build the Decoder
+         self.target_embeddings = Embedding(target_vocab_size, dmodel)
+         self.target_positional_encoding = PositionalEncoding(
+             dmodel,
+             max_seq_length=target_vocab_size,
+             pdropout=pdropout
+         )
+         self.decoder = Decoder(
+             dk,
+             dv,
+             h,
+             num_decoders,
+             dim_multiplier=dim_multiplier,
+             pdropout=pdropout)
+
+         # Final output projection
+         self.linear = nn.Linear(dmodel, target_vocab_size)
+         # self.softmax = nn.Softmax(dim=-1)
+         self.device = device
+         self.src_pad_idx = src_pad_idx
+         self.target_pad_idx = target_pad_idx
+         self.init_params()
+
+     # This part wasn't mentioned in the paper, but it's super important!
+     def init_params(self):
+         """
+         Xavier initialization has a tremendous impact! Even with
+         normalization layers, the model's performance depends heavily
+         on the choice of weight initialization.
+         """
+         for name, p in self.named_parameters():
+             if p.dim() > 1:
+                 nn.init.xavier_uniform_(p)
+
+     def make_src_mask(self, src):
+         """
+         Args:
+             src: raw sequences with padding (batch_size, seq_length)
+             src_pad_idx(int): padding index whose tokens should not be attended to
+
+         Returns:
+             src_mask: mask for each sequence (batch_size, 1, 1, seq_length)
+         """
+         batch_size = src.shape[0]
+         # assign 1 to tokens that need to be attended to and 0 to padding tokens,
+         # then add 2 dimensions
+         src_mask = (src != self.src_pad_idx).view(batch_size, 1, 1, -1)
+         return src_mask
+
+     def make_target_mask(self, target):
+         """
+         Args:
+             target: raw sequences with padding (batch_size, seq_length)
+             target_pad_idx(int): padding index whose tokens should not be attended to
+
+         Returns:
+             target_mask: mask for each sequence (batch_size, 1, seq_length, seq_length)
+         """
+
+         seq_length = target.shape[1]
+         batch_size = target.shape[0]
+
+         # assign True to tokens that need to be attended to and
+         # False to padding tokens, then add 2 dimensions
+         target_mask = (target != self.target_pad_idx).view(batch_size, 1, 1, -1)  # (batch_size, 1, 1, seq_length)
+
+         # generate the subsequent (causal) mask
+         trg_sub_mask = torch.tril(torch.ones((seq_length, seq_length), device=self.device)).bool()  # (seq_length, seq_length)
+
+         # bitwise "and" operator | 0 & 0 = 0, 1 & 1 = 1, 1 & 0 = 0
+         target_mask = target_mask & trg_sub_mask
+
+         return target_mask
+
+     def forward(
+             self,
+             src_token_ids_batch,
+             target_token_ids_batch):
+
+         # create source and target masks
+         src_mask = self.make_src_mask(
+             src_token_ids_batch)  # (batch_size, 1, 1, src_seq_length)
+         target_mask = self.make_target_mask(
+             target_token_ids_batch)  # (batch_size, 1, trg_seq_length, trg_seq_length)
+
+         # Create embeddings
+         src_representations = self.src_embeddings(src_token_ids_batch)
+         src_representations = self.src_positional_encoding(src_representations)
+
+         target_representations = self.target_embeddings(target_token_ids_batch)
+         target_representations = self.target_positional_encoding(target_representations)
+
+         # Encode
+         encoded_src = self.encoder(src_representations, src_mask)
+
+         # Decode
+         decoded_output = self.decoder(
+             target_representations,
+             encoded_src,
+             target_mask,
+             src_mask)
+
+         # Post processing
+         out = self.linear(decoded_output)
+         # Don't apply softmax here: the loss is computed against the raw
+         # linear outputs (logits), not softmaxed probabilities.
+         # # Output
+         # out = self.softmax(out)
+         return out
+
+ def count_parameters(model):
+     return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ if __name__ == "__main__":
+     """
+     The following parameters are for the Multi30K dataset
+     """
+     dk = 32
+     dv = 32
+     h = 8
+     src_vocab_size = 7983
+     target_vocab_size = 5979
+     src_pad_idx = 2
+     target_pad_idx = 2
+     num_encoders = 3
+     num_decoders = 3
+     dim_multiplier = 4
+     pdropout = 0.1
+     # print(111)
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     model = Transformer(
+         dk,
+         dv,
+         h,
+         src_vocab_size,
+         target_vocab_size,
+         num_encoders,
+         num_decoders,
+         src_pad_idx,
+         target_pad_idx,
+         dim_multiplier,
+         pdropout,
+         device=device)
+     if torch.cuda.is_available():
+         model.cuda()
+     print(model)
+     print(f'The model has {count_parameters(model):,} trainable parameters')
+
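To make the masking logic concrete, here is what make_target_mask produces for a single padded sequence. It assumes the __main__ block above has already run (so model, device, and the pad index of 2 are in scope) and is not part of the commit.

# Hypothetical illustration of the combined padding + causal target mask
# (not in this commit). Pad index is 2, as in the __main__ block above.
trg = torch.tensor([[5, 7, 9, 2]], device=device)  # last token is padding
print(model.make_target_mask(trg))
# On CPU this prints:
# tensor([[[[ True, False, False, False],
#           [ True,  True, False, False],
#           [ True,  True,  True, False],
#           [ True,  True,  True, False]]]])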
params.json ADDED
@@ -0,0 +1 @@
+ {"dk": 32, "dv": 32, "h": 8, "src_vocab_size": 8500, "target_vocab_size": 6500, "src_pad_idx": 2, "target_pad_idx": 2, "num_encoders": 3, "num_decoders": 3, "dim_multiplier": 4, "pdropout": 0.1, "lr": 0.0003, "N_EPOCHS": 50, "CLIP": 1, "patience": 5}
trained_model/transformer-model.pt → pytorch_transformer_model.pt RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8c72ccefd0a3594899f7f6e4d0266c74d18497b51e953261d4f678855a863258
- size 56911669
+ oid sha256:bec7a1a3b8371fa8260fcfc9204e6695714f221cd54f121503e6241e31def867
+ size 59573843
vocab.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:457ebb2e34df81149998f2fa2bfe6b7c3aac3964beff79b3dd24057c48341cb4
+ size 249451