ftakelait committed
Commit
b1c0f8d
1 Parent(s): 23a7f71

Add application files

app.py ADDED
@@ -0,0 +1,60 @@
+ import warnings
+ from cryptography.utils import CryptographyDeprecationWarning
+
+ with warnings.catch_warnings():
+     warnings.filterwarnings('ignore', category=CryptographyDeprecationWarning)
+     import paramiko
+
+ import gradio as gr
+ # from transformers import pipeline
+ from transformers import PreTrainedTokenizerFast, AutoTokenizer
+ from transformer_mt.modeling_transformer import TransfomerEncoderDecoderModel
+ from transformer_mt_roberta.modeling_transformer_final import TransfomerEncoderDecoderModel as mt_roberta
+
+ # translation_pipeline = pipeline('translation_en_to_fr')
+
+ # Setting up the translation transformer in Gradio.
+ # def translator_fn(text_input):
+ #     results = translation_pipeline(text_input)
+ #     return results[0]['translation_text']
+
+ # def translator_fn_baseline(text_in):
+ #     source_tokenizer = PreTrainedTokenizerFast.from_pretrained("da_en_output_dir/da_tokenizer")
+ #     target_tokenizer = PreTrainedTokenizerFast.from_pretrained("da_en_output_dir/en_tokenizer")
+ #     model = TransfomerEncoderDecoderModel.from_pretrained("da_en_output_dir")
+ #
+ #     input_ids = source_tokenizer.encode(text_in, return_tensors="pt")
+ #     output_ids = model.generate(
+ #         input_ids,
+ #         max_length=10,
+ #         bos_token_id=target_tokenizer.bos_token_id,
+ #         eos_token_id=target_tokenizer.eos_token_id,
+ #         pad_token_id=target_tokenizer.pad_token_id,
+ #     )
+ #
+ #     return target_tokenizer.decode(output_ids[0])
+
+ def translator_fn_roberta(text_in):
+     # Danish RoBERTa tokenizer on the source side, custom English tokenizer on the target side
+     source_tokenizer_pretrained_roberta = AutoTokenizer.from_pretrained("flax-community/roberta-base-danish")
+     target_tokenizer_pretrained_roberta = PreTrainedTokenizerFast.from_pretrained("da_en_output_dir/en_tokenizer")
+     model_pretrained_roberta = mt_roberta.from_pretrained("da_en_RoBERTa_pretrained")
+
+     input_ids_pretrained_roberta = source_tokenizer_pretrained_roberta.encode(text_in, return_tensors="pt")
+     output_ids_pretrained_roberta = model_pretrained_roberta.generate(
+         input_ids_pretrained_roberta,
+         max_length=10,
+         bos_token_id=target_tokenizer_pretrained_roberta.bos_token_id,
+         eos_token_id=target_tokenizer_pretrained_roberta.eos_token_id,
+         pad_token_id=target_tokenizer_pretrained_roberta.pad_token_id,
+     )
+     return target_tokenizer_pretrained_roberta.decode(output_ids_pretrained_roberta[0])
+
+ iface = gr.Interface(fn=translator_fn_roberta,
+                      inputs=gr.inputs.Textbox(lines=2, placeholder=None, label="Your Danish text goes here."),
+                      outputs=['text'],  # the list should match the number of values returned by fn (one text output here)
+                      description="This app translates text from Danish to English.",
+                      title="Danish to English Translator App",
+                      theme="peach")
+
+ iface.launch(share=False, enable_queue=True)
da_en_RoBERTa_pretrained/en_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]"}
da_en_RoBERTa_pretrained/en_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
da_en_RoBERTa_pretrained/en_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]", "tokenizer_class": "PreTrainedTokenizerFast"}
da_en_RoBERTa_pretrained/model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:43e9463469dfeb0d2c5fed75b6181ec570e95fda4c6565c6f80387782f1aa618
+ size 885137451
da_en_RoBERTa_pretrained/model_config.json ADDED
@@ -0,0 +1 @@
+ {"num_layers": 6, "hidden": 768, "num_heads": 8, "fcn_hidden": 2048, "src_vocab_size": 32000, "tgt_vocab_size": 32000, "max_seq_len": 128, "dropout": 0.1}
da_en_output_dir/da_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]"}
da_en_output_dir/da_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
da_en_output_dir/da_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]", "tokenizer_class": "PreTrainedTokenizerFast"}
da_en_output_dir/en_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]"}
da_en_output_dir/en_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
da_en_output_dir/en_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]", "tokenizer_class": "PreTrainedTokenizerFast"}
da_en_output_dir/model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d93af21df63a573aac135ee8e6a3e984424471f07e707a942f660be1854f1067
+ size 616931903
da_en_output_dir/model_config.json ADDED
@@ -0,0 +1 @@
+ {"num_layers": 6, "hidden": 768, "num_heads": 8, "fcn_hidden": 2048, "src_vocab_size": 32000, "tgt_vocab_size": 32000, "max_seq_len": 128, "dropout": 0.1}
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch >= 1.3
+ datasets >= 1.8.0
+ tokenizers
+ wandb
+ transformers
transformer_mt/__init__.py ADDED
File without changes
transformer_mt/modeling_attention.py ADDED
@@ -0,0 +1,126 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2022 Vladislav Lialin and Namrata Shivagunde
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+
21
+ class MultiHeadAttention(nn.Module):
22
+ def __init__(self, input_size, hidden, num_heads, causal=False):
23
+ """Multi-head attention module which computes [softmax(xQ_h @ xK_h^T) @ xV: ...] @ U
24
+
25
+ Can work as both self-attention or cross-attention (if kv is provided to .forward).
26
+
27
+ Args:
28
+ causal: use causal masking (do not allow target to look to the future or current token of source)
29
+ """
30
+ if hidden % num_heads:
31
+ raise ValueError(f"hidden should be divisible by num_heads, "
32
+ f"but got hidden={hidden} and num_heads={num_heads}")
33
+ super().__init__()
34
+
35
+ self.k = nn.Linear(input_size, hidden)
36
+ self.q = nn.Linear(input_size, hidden)
37
+ self.v = nn.Linear(input_size, hidden)
38
+ self.mix = nn.Linear(hidden, hidden)
39
+
40
+ self.num_heads = num_heads
41
+ self.head_size = hidden // num_heads
42
+ self.scale = self.head_size ** 0.5
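+ # scale factor sqrt(head_size), used for scaled dot-product attention in forward()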
43
+ self.causal = causal # causal masking
44
+
45
+ def forward(self, q, kv=None, key_padding_mask=None, return_attention=False):
46
+ """[Softmax(source Q_1 @ target K_1^T) @ target V_1 : ... ) @ x V_heads] @ U
47
+
48
+ Performs self-attention if kv is not specified.
49
+ In this case, kv = q and kv_seq_len = query_seq_len.
50
+
51
+ Args:
52
+ q: FloatTensor[batch_size, query_seq_len, input_size]
53
+ kv (target) : optional, FloatTensor[batch_size, kv_seq_len, input_size]
54
+ key_padding_mask: BoolTensor[batch_size, kv_seq_len] 0 means unpadded, 1 means padded
55
+
56
+ Returns:
57
+ FloatTensor[batch_size, seq_len, hidden]
58
+ """
59
+
60
+ # Task 1.1 (1 point)
61
+ # Update this function with cross-attention mechanism
62
+ # If target is None, then target (kv) and source (q) will be same.
63
+ # Define k, q, v using self.k, self.q and self.v based on if the target exists or not
64
+ # Note : Please write shape of each tensor for each line of code
65
+ ## YOUR CODE STARTS HERE## ~ 2 lines code
66
+ # compute projections; fall back to self-attention when kv is not given
+ kv = kv if kv is not None else q
+ k = self.k(kv)  # [batch_size, kv_seq_len, hidden]
+ v = self.v(kv)  # [batch_size, kv_seq_len, hidden]
+ q = self.q(q)  # [batch_size, query_seq_len, hidden]
71
+
72
+ # YOUR CODE ENDS HERE
73
+
74
+ bs, attending_seq, _ = q.shape
75
+ attended_seq = k.shape[1]
76
+
77
+ # [b, s, h] -> [b, h, s] -> [b * heads, h / heads, s] -> [b * heads, s, h / heads]
78
+ k = k.transpose(1, 2).reshape(bs * self.num_heads, self.head_size, -1).transpose(1, 2).contiguous() # [batch * num_heads, seq, hidden / num_heads]
79
+ q = q.transpose(1, 2).reshape(bs * self.num_heads, self.head_size, -1).transpose(1, 2).contiguous()
80
+ v = v.transpose(1, 2).reshape(bs * self.num_heads, self.head_size, -1).transpose(1, 2).contiguous()
81
+
82
+ scores = q @ k.transpose(1, 2) / self.scale # [batch * num_heads, attending_seq, attended_seq]
83
+ assert scores.shape == (bs * self.num_heads, attending_seq, attended_seq)
84
+
85
+
86
+ if key_padding_mask is not None:
87
+ # Task 1.2 (1 point)
88
+ # Padding
89
+ # Set the scores corresponding to padded positions (key_padding_mask == 1) to -inf
90
+ #
91
+ # You might need to reshape the scores to [batch_size, seq_len, seq_len]
92
+ # in this case, remember to reshape them back
93
+ # Our implementation is 3 lines
94
+ # YOUR CODE STARTS HERE
95
+ # print(scores.shape, key_padding_mask.unsqueeze(-2).shape)
96
+
97
+
98
+ # view scores as [batch, heads, attending_seq, attended_seq] so the padding mask broadcasts per example
+ scores = scores.reshape(bs, self.num_heads, attending_seq, attended_seq)
+ scores = scores.masked_fill(key_padding_mask[:, None, None, :] == 1, value=float("-inf"))
+ scores = scores.view(bs * self.num_heads, attending_seq, attended_seq)
104
+
105
+
106
+ # YOUR CODE ENDS HERE
107
+
108
+ assert scores.size() == (bs * self.num_heads, attending_seq, attended_seq),\
109
+ f"scores have wrong shape. Expected {(bs * self.num_heads, attending_seq, attended_seq)}, got {scores.size()}"
110
+
111
+ if self.causal:
112
+ causal_mask = torch.triu(torch.ones(attending_seq, attended_seq, dtype=torch.bool, device=scores.device), diagonal=1)
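+ # triu with diagonal=1 marks strictly-future positions, which are filled with -inf below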
113
+ scores.masked_fill_(causal_mask.bool().unsqueeze(0), float("-inf"))
114
+
115
+ probs = torch.softmax(scores, dim=-1) # [batch * num_heads, tgt_seq, src_seq]
116
+ att = probs @ v # [batch * num_heads, tgt_seq, hidden / num_heads]
117
+
118
+ # [b * heads, s, h / heads] -> [b * heads, h / heads, s] -> [b, h, s] -> [b, s, h]
119
+ att = att.transpose(1, 2).reshape(bs, -1, attending_seq).transpose(1, 2).contiguous()
120
+
121
+ att = self.mix(att)
122
+
123
+ if return_attention:
124
+ return att, probs
125
+
126
+ return att
transformer_mt/modeling_transformer.py ADDED
@@ -0,0 +1,579 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2022 Vladislav Lialin and Namrata Shivagunde
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ import os
17
+ import json
18
+ from collections import namedtuple
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+
24
+ from transformer_mt.modeling_attention import MultiHeadAttention
25
+ from transformer_mt.utils import pad
26
+
27
+
28
+ Hypothesis = namedtuple("Hypothesis", ["value", "score"])
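+ # a beam-search hypothesis stores its token ids (value) and cumulative log-probability (score)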
29
+
30
+
31
+ class TransformerEncoderLayer(nn.Module):
32
+ def __init__(self, hidden, num_heads, fcn_hidden, dropout=0.0, causal=False):
33
+ super().__init__()
34
+
35
+ self.self_attention = MultiHeadAttention(
36
+ input_size=hidden,
37
+ hidden=hidden,
38
+ num_heads=num_heads,
39
+ causal=causal,
40
+ )
41
+ self.att_layer_norm = nn.LayerNorm(hidden)
42
+
43
+ self.fcn = nn.Sequential(
44
+ nn.Linear(hidden, fcn_hidden),
45
+ nn.ReLU(),
46
+ nn.Linear(fcn_hidden, hidden),
47
+ )
48
+ self.fcn_layer_norm = nn.LayerNorm(hidden)
49
+ self.dropout = nn.Dropout(dropout)
50
+
51
+ def forward(self, x, key_padding_mask=None):
52
+ """Self-Attention -> residual -> LayerNorm -> FCN -> residual -> LayerNorm
53
+
54
+ Args:
55
+ x: FloatTensor[batch_size, seq_len, input_size]
56
+
57
+ Returns:
58
+ FloatTensor[batch_size, seq_len, hidden]
59
+ """
60
+ # print('calling encode', key_padding_mask.shape)
61
+ residual = x
62
+ x = self.self_attention(x, key_padding_mask=key_padding_mask)
63
+ x = self.att_layer_norm(x + residual)
64
+
65
+ residual = x
66
+ x = self.fcn(x)
67
+ x = self.dropout(x)
68
+ x = self.fcn_layer_norm(x + residual)
69
+
70
+
71
+ return x
72
+
73
+
74
+ class TransformerDecoderLayer(nn.Module):
75
+ def __init__(self, hidden, num_heads, fcn_hidden, dropout=0.0):
76
+ super().__init__()
77
+
78
+ # Task 2.1 (1 point)
79
+ # Create layers needed for Transformer Decoder Layer
80
+ # 1. Create self.self_attention layer using MultiHeadAttention
81
+ # 2. Create self.cross_attention layer using MultiHeadAttention
82
+ # 2a. Which one of self_attention or cross_attention should have causal=True? Set it there.
83
+ # 3. Create self.att_layer_norm, self.cross_att_layer_norm, and self.fcn_layer_norm layers using LayerNorm
84
+ # 4. Create self.fcn network using nn.Sequential, nn.ReLU and nn.Linear
85
+ # 5. Create self.dropout layer using nn.Dropout
86
+ # YOUR CODE STARTS HERE (our implementation is about 5-8 lines)
87
+
88
+ self.self_attention = MultiHeadAttention(
89
+ input_size=hidden,
90
+ hidden=hidden,
91
+ num_heads=num_heads,
92
+ causal=True,
93
+ )
94
+
95
+ self.cross_attention = MultiHeadAttention(
96
+ input_size=hidden,
97
+ hidden=hidden,
98
+ num_heads=num_heads,
99
+ causal=False,
100
+ )
101
+
102
+ self.self_att_layer_norm = nn.LayerNorm(hidden)
103
+ self.cross_att_layer_norm = nn.LayerNorm(hidden)
104
+
105
+ self.fcn = nn.Sequential(
106
+ nn.Linear(hidden, fcn_hidden),
107
+ nn.ReLU(),
108
+ nn.Linear(fcn_hidden, hidden),
109
+ )
110
+ self.fcn_layer_norm = nn.LayerNorm(hidden)
111
+ self.dropout = nn.Dropout(dropout)
112
+
113
+ # YOUR CODE ENDS HERE
114
+
115
+ def forward(self, decoder_hidden_states, encoder_hidden_states, key_padding_mask=None):
116
+ """Transformer Decoder Layer
117
+
118
+ Args:
119
+ decoder_hidden_states: FloatTensor[batch_size, query_seq_len, hidden]
120
+ encoder_hidden_states: FloatTensor[batch_size, kv_seq_len, hidden]
121
+ key_padding_mask: ByteTensor[batch_size, kv_seq_len] with 1 for padded tokens and 0 for regular tokens
122
+
123
+ Returns:
124
+ FloatTensor[batch_size, query_seq_len, hidden]
125
+ """
126
+
127
+ # Task 2.2 (1 point)
128
+ # Implement Transformer decoder block
129
+ # Remember that transformer decoder block is composed of:
130
+ # 1. Self-Attention
131
+ # 2. Residual connection
132
+ # 3. LayerNorm
133
+ # 4. Cross-Attention
134
+ # 5. Residual connection
135
+ # 6. LayerNorm
136
+ # 7. Fully-Connected Layer
137
+ # 8. Dropout
138
+ # 9. Residual connection
139
+ # 10. LayerNorm
140
+ # Note : Please write shape of the tensor for each line of code
141
+ # YOUR CODE STARTS HERE (our implementation is about 10 lines)
142
+ # print('calling decode', "decoder hidden states:",decoder_hidden_states.shape, 'encoder_hidden_states:',encoder_hidden_states.shape, "key_oadding:",key_padding_mask.shape)
143
+ residual_1 = decoder_hidden_states
144
+ # print("calling_self attention for decoder")
145
+ out = self.self_attention(decoder_hidden_states, key_padding_mask=None)
146
+ out = self.self_att_layer_norm(residual_1 + out)
147
+ residual_2 = out
148
+ # print("calling_cross attention for decoder")
149
+ out = self.cross_attention(q = out, kv = encoder_hidden_states, key_padding_mask = key_padding_mask)
150
+ # print("out after cross", out.shape)
151
+ # print('----')
152
+ out = self.cross_att_layer_norm(out + residual_2)
+ # feed-forward block: keep the pre-FCN activations for the residual connection
+ residual_3 = out
+ out = self.fcn(out)
+ out = self.dropout(out)
+ out = self.fcn_layer_norm(out + residual_3)
157
+
158
+
159
+ ##YOUR CODE ENDS HERE##
160
+ return out
161
+
162
+
163
+ class TransfomerEncoderDecoderModel(nn.Module):
164
+ def __init__(
165
+ self,
166
+ *,
167
+ num_layers,
168
+ hidden,
169
+ num_heads,
170
+ fcn_hidden,
171
+ max_seq_len,
172
+ src_vocab_size,
173
+ tgt_vocab_size,
174
+ dropout=0.1,
175
+ ):
176
+ """A minimal implementation of Transformer Encoder Decoder Model
177
+
178
+ Args:
179
+ num_layers: number of layers for the encoder and the decoder (in total, the model has 2 * num_layers layers)
+ hidden: embedding size and hidden size of the attention layers
+ fcn_hidden: hidden size of the fully-connected networks inside the transformer layers
+ max_seq_len: maximum length of the input or target sequence, whichever is longer
+ src_vocab_size: source vocabulary size
+ tgt_vocab_size: target vocabulary size
186
+ """
187
+ super().__init__()
188
+ self.src_vocab_size = src_vocab_size
189
+ self.tgt_vocab_size = tgt_vocab_size
190
+ self.num_layers = num_layers
191
+ self.hidden = hidden
192
+ self.num_heads = num_heads
193
+ self.fcn_hidden = fcn_hidden
194
+ self.dropout_rate = dropout
195
+ self.max_seq_len = max_seq_len
196
+
197
+ # Task 2.3 (1 point)
198
+ # 1. Create encoder, decoder and positional embedding layer
199
+ # Use nn.Embedding for that and make sure to include source and target vocabulary size
200
+ # 2. Create a linear layer out_proj that will project contextualized representations
201
+ # of size hidden to your target vocabulary size.
202
+ # 3. Create a dropout layer
203
+ # YOUR CODE STARTS HERE (our implementation is about 5 lines)
204
+
205
+ self.encoder_embeddings = nn.Embedding(self.src_vocab_size, self.hidden)
206
+ self.decoder_embeddings = nn.Embedding(self.tgt_vocab_size, self.hidden)
207
+ self.positional_emb = nn.Embedding(self.max_seq_len, self.hidden)
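+ # learned position embeddings, shared by encoder and decoder inputs via _add_positions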
208
+
209
+ self.out_proj = nn.Linear(self.hidden, self.tgt_vocab_size)
210
+
211
+ self.dropout = nn.Dropout(self.dropout_rate)
212
+ # YOUR CODE ENDS HERE
213
+
214
+ # Task 2.4 (1 point)
215
+ # 1. Create a list of encoder Layers
216
+ # 2. Create a list of decoder Layers
217
+ #
218
+ # Note that you need to wrap it with nn.ModuleList,
219
+ # so that the parameters of the layers are registered as parameters of the model
220
+ # https://pytorch.org/docs/stable/generated/torch.nn.ModuleList.html
221
+ # Read more about ModuleList here:
222
+ # https://github.com/FrancescoSaverioZuppichini/Pytorch-how-and-when-to-use-Module-Sequential-ModuleList-and-ModuleDict
223
+ # You can use a for-loop or a Python list comprehension to create the list of layers
224
+ #
225
+ # YOUR CODE STARTS HERE (our implementation is 3-6 lines)
226
+ self.encoder_layers = nn.ModuleList([TransformerEncoderLayer(hidden = self.hidden,
227
+ num_heads = self.num_heads,
228
+ fcn_hidden = self.fcn_hidden,
229
+ dropout=self.dropout_rate
230
+ )
231
+ for _ in range(self.num_layers)
232
+ ])
233
+
234
+ self.decoder_layers = nn.ModuleList([TransformerDecoderLayer(hidden = self.hidden,
235
+ num_heads = self.num_heads,
236
+ fcn_hidden = self.fcn_hidden,
237
+ dropout=self.dropout_rate
238
+ )
239
+ for _ in range(self.num_layers)
240
+ ])
241
+
242
+ # YOUR CODE ENDS HERE
243
+
244
+ def _add_positions(self, sequence_tensor):
245
+ """Adds positional embeddings to the input tensor.
246
+ Args:
247
+ sequence_tensor: FloatTensor[batch_size, seq_len, hidden]
248
+ """
249
+ seq_len = sequence_tensor.shape[1]
250
+ positions = torch.arange(seq_len, device=sequence_tensor.device)
251
+ positional_emb = self.positional_emb(positions)
252
+ output = sequence_tensor + positional_emb
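+ # positional_emb is [seq_len, hidden] and broadcasts over the batch dimension of sequence_tensor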
253
+ return output
254
+
255
+ def forward(
256
+ self,
257
+ input_ids=None,
258
+ encoder_hidden_states=None,
259
+ decoder_input_ids=None,
260
+ key_padding_mask=None,
261
+ ):
262
+ """
263
+ input_ids -> encoder_emb -> encoder ->
264
+ --> decoder(encoder_output, decoder_emb) -> logits
265
+ decoder_input_ids -> decoder_emb ---->
266
+
267
+ Model accepts either input_ids or encoder_hidden_states.
268
+ The former is used for training, the latter is used for inference, because during inference
269
+ we don't have the target sequence and want to forward the decoder multiple times.
270
+ To make the inference more efficient, we can only compute encoder output once and reuse it
271
+ for all decoder steps.
272
+
273
+ Meaning during training you should forward the model like this:
274
+ model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
275
+
276
+ but during inference (generating translation) you should forward the model like this:
277
+ model(encoder_hidden_states=encoder_hidden_states, decoder_input_ids=decoder_input_ids)
278
+
279
+ Args:
280
+ input_ids (LongTensor): Encoder input sequence of size (batch_size, seq_len)
281
+ encoder_hidden_states (FloatTensor): Encoder hidden states of size (batch_size, seq_len, hidden)
282
+ decoder_input_ids (LongTensor) : Decoder input sequence of size (batch_size, out_seq_len)
283
+ key_padding_mask (ByteTensor): Mask of size (batch_size, seq_len) where 1 means that the token is padding
284
+
285
+ Return:
286
+ logits (FloatTensor): Logits for output sequence of size (batch_size, out_seq_len, dec_vocab_size)
287
+
288
+ """
289
+ if input_ids is None and encoder_hidden_states is None:
290
+ raise ValueError("You should provide either input_ids or encoder_hidden_states")
291
+
292
+ if encoder_hidden_states is None:
293
+ encoder_hidden_states = self._encode(input_ids, key_padding_mask)
294
+
295
+ logits = self._decode(encoder_hidden_states, decoder_input_ids, key_padding_mask)
296
+ # print("Targte vocab size", decoder_input_ids.shape)
297
+ # print("logits---------", logits.shape)
298
+
299
+ return logits
300
+
301
+ def _encode(self, input_ids, key_padding_mask):
302
+ # Task 2.5 (2 points)
303
+ # 1. Get source embeddings using self.encoder_embeddings
304
+ # 2. Add positional embedding to encoder embeddings using _add_positions
305
+ # 3. Pass source embeddings through the encoder layers, name them encoder_hidden_states
306
+ # 3a. Remember to use key_padding_mask to mask out padding tokens
307
+ # YOUR CODE STARTS HERE
308
+ encoder_hidden_states = self.encoder_embeddings(input_ids)
309
+ encoder_hidden_states = self._add_positions(encoder_hidden_states)
310
+ for l in self.encoder_layers:
311
+ encoder_hidden_states = l(encoder_hidden_states, key_padding_mask = key_padding_mask)
312
+
313
+ # YOUR CODE ENDS HERE
314
+
315
+ return encoder_hidden_states
316
+
317
+ def _decode(self, encoder_hidden_states, decoder_input_ids, key_padding_mask):
318
+ # TASK 2.6 (2 points)
319
+ # 1. Get decoder embeddings using self.decoder_embeddings
320
+ # 2. Add positional embedding to target embeddings using _add_positions
321
+ # 3. Use decoder embeddings and encoder_hidden_states as the decoder input
322
+ # (please use keyword arguments instead of positional arguments to minimize a chance of a bug)
323
+ # 3a. Remember to use key_padding_mask to mask out padding tokens for the encoder inputs
324
+ # 4. use self.out_proj to get the output logits, i.e. unnormalized scores over the target vocabulary
325
+ # YOUR CODE STARTS HERE
326
+ decoder_embedding = self.decoder_embeddings(decoder_input_ids)
327
+ decoder_embedding = self._add_positions(decoder_embedding)
328
+ # print("decoder_Embedding", decoder_embedding.shape)
329
+ for l in self.decoder_layers:
330
+ decoder_embedding = l(decoder_hidden_states = decoder_embedding, encoder_hidden_states=encoder_hidden_states, key_padding_mask = key_padding_mask)
331
+
332
+ logits = self.out_proj(decoder_embedding)
333
+ ## YOUR CODE ENDS HERE
334
+ return logits
335
+
336
+ ##############################################################################
337
+ # Don't worry about any of the code below this line, but feel free to take a look
338
+ # if you are interested in generation or model saving/loading.
339
+ ##############################################################################
340
+ @torch.inference_mode()
341
+ def generate(
342
+ self,
343
+ input_ids,
344
+ *,
345
+ bos_token_id,
346
+ eos_token_id,
347
+ pad_token_id=None,
348
+ key_padding_mask=None,
349
+ max_length=50,
350
+ beam_size=5,
351
+ kind="beam_search",
352
+ ):
353
+ """
354
+ Generate a translation given an input sequence.
355
+
356
+ Args:
357
+ input_ids (LongTensor): Encoder input sequence of size (batch_size, seq_len)
358
+ bos_token_id (int): Beginning of sentence token id
359
+ eos_token_id (int): End of sentence token id
360
+ pad_token_id (int): Padding token id, required if doing beam search
361
+ key_padding_mask (ByteTensor): Mask of size (batch_size, seq_len) where 1 means that the token is padding
362
+ max_length (int): Maximum length of the generated sequence
363
+ beam_size (int): Beam size for beam search
364
+ kind (str): Can be either "greedy" or "beam_search"
365
+
366
+ Return:
367
+ decoded_ids (LongTensor): Decoder output sequence of size (batch_size, seq_len)
368
+ """
369
+ if kind not in ["greedy", "beam_search"]:
370
+ raise ValueError("Unknown kind of generation: {}".format(kind))
371
+ if kind == "beam_search" and pad_token_id is None:
372
+ raise ValueError("Beam search requires a pad_token_id to be provided")
373
+
374
+ if kind == "greedy":
375
+ return self._generate_greedy(
376
+ input_ids=input_ids,
377
+ bos_token_id=bos_token_id,
378
+ eos_token_id=eos_token_id,
379
+ key_padding_mask=key_padding_mask,
380
+ max_length=max_length,
381
+ )
382
+
383
+ # beam search only supports batch size 1
384
+ beam_search_generations = []
385
+ for i in range(input_ids.size(0)):
386
+ _input_ids = input_ids[i].unsqueeze(0)
387
+ _key_padding_mask = key_padding_mask[i].unsqueeze(0) if key_padding_mask is not None else None
388
+
389
+ generated = self._generate_beam_search(
390
+ input_ids=_input_ids,
391
+ bos_token_id=bos_token_id,
392
+ eos_token_id=eos_token_id,
393
+ key_padding_mask=_key_padding_mask,
394
+ max_length=max_length,
395
+ beam_size=beam_size,
396
+ )
397
+
398
+ beam_search_generations.append(generated[0].detach().cpu().tolist())
399
+
400
+ return pad(beam_search_generations, pad_id=eos_token_id)
401
+
402
+ @torch.inference_mode()
403
+ def _generate_greedy(
404
+ self,
405
+ input_ids,
406
+ *,
407
+ bos_token_id,
408
+ eos_token_id,
409
+ key_padding_mask=None,
410
+ max_length=50,
411
+ ):
412
+ """
413
+ Greedy generation of translation. Selects most likely word on every step.
414
+
415
+ Args:
416
+ input_ids (LongTensor): Encoder input sequence of size (batch_size, seq_len)
417
+ max_length (int): Maximum length of the generated sequence
418
+ bos_token_id (int): Beginning of sentence token id
419
+ eos_token_id (int): End of sequence token id
420
+
421
+ Return:
422
+ translation (LongTensor): Decoder output sequence of size (batch_size, out_seq_len)
423
+ where out_seq_len <= max_length
424
+ """
425
+ encoder_hidden_states = self._encode(input_ids, key_padding_mask)
426
+
427
+ decoder_input_ids = torch.full((input_ids.shape[0], 1), bos_token_id, dtype=torch.long, device=input_ids.device)
428
+ translation = torch.zeros((input_ids.shape[0], 0), dtype=torch.long, device=input_ids.device)
429
+
430
+ eos_flags = torch.zeros((input_ids.shape[0],), dtype=torch.uint8, device=input_ids.device)
431
+
432
+ for _ in range(max_length):
433
+ logits = self._decode(encoder_hidden_states, decoder_input_ids, key_padding_mask)
434
+ logits = logits[:, -1, :]
435
+
436
+ next_token_id = torch.argmax(logits, dim=-1)
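+ # greedy decoding: take the highest-scoring token for every sequence in the batch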
437
+
438
+ decoder_input_ids = torch.cat((decoder_input_ids, next_token_id.unsqueeze(1)), dim=1)
439
+ translation = torch.cat((translation, next_token_id.unsqueeze(1)), dim=1)
440
+
441
+ eos_flags |= (next_token_id == eos_token_id)
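+ # track which sequences have already produced EOS; generation stops once all of them have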
442
+
443
+ if eos_flags.all():
444
+ break
445
+
446
+ return translation
447
+
448
+ @torch.inference_mode()
449
+ def _generate_beam_search(
450
+ self,
451
+ input_ids,
452
+ *,
453
+ bos_token_id,
454
+ eos_token_id,
455
+ key_padding_mask=None,
456
+ beam_size=5,
457
+ max_length=50,
458
+ ):
459
+ """
460
+ Beam search generation of translation.
461
+ Heavily inspired by https://github.com/pcyin/pytorch_basic_nmt
462
+
463
+ Args:
464
+ input_ids (LongTensor): Encoder input sequence of size (batch_size, seq_len)
465
+ max_length (int): Maximum length of the generated sequence
466
+ bos_token_id (int): Beginning of sentence token id
467
+ eos_token_id (int): End of sequence token id
468
+
469
+ Return:
470
+ translation (LongTensor): Decoder output sequence of size (batch_size, out_seq_len)
471
+ where out_seq_len <= max_length
472
+ """
473
+ assert len(input_ids) == 1, "Beam search is only supported for a single input sequence"
474
+ encoder_hidden_states = self._encode(input_ids, key_padding_mask)
475
+ device = input_ids.device
476
+
477
+ hypotheses = [[bos_token_id]]
478
+ hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=device)
479
+ completed_hypotheses = []
480
+
481
+ for _ in range(max_length):
482
+ if len(completed_hypotheses) >= beam_size:
483
+ break
484
+
485
+ hyp_num = len(hypotheses)
486
+ expanded_encoder_hidden_states = encoder_hidden_states.expand(
487
+ hyp_num,
488
+ encoder_hidden_states.size(1),
489
+ encoder_hidden_states.size(2),
490
+ )
491
+
492
+ # [batch_size*hyp_num=1*hyp_num, seq_len, hidden]
493
+ hypotheses_tensor = torch.tensor(hypotheses, dtype=torch.int64, device=device)
494
+ logits = self._decode(expanded_encoder_hidden_states, hypotheses_tensor, key_padding_mask)
495
+ logits = logits[:, -1, :] # [vocab_size]
496
+
497
+ log_p_t = F.log_softmax(logits, dim=-1)
498
+ live_hyp_num = beam_size - len(completed_hypotheses)
499
+
500
+ # [hyp_num] -> [1, hyp_num] -> [hyp_num, vocab_size] -> [hyp_num * vocab_size]
501
+ new_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
502
+ # [live_hyp_num], [live_hyp_num]
503
+ # for indices, the values range from 0 to hyp_num * vocab_size
504
+ top_new_hyp_scores, top_new_hyp_pos = torch.topk(new_hyp_scores, k=live_hyp_num)
505
+
506
+ # hypotheses ids in hyp_scores tensor [hyp_num,]
507
+ prev_hyp_ids = torch.div(top_new_hyp_pos, self.tgt_vocab_size, rounding_mode='floor')
508
+
509
+ # ids of the next words for each hypothesis
510
+ token_ids = top_new_hyp_pos % self.tgt_vocab_size
511
+
512
+ new_hypotheses = []
513
+ new_hyp_scores = []
514
+
515
+ # iterate live_hyp_num times
516
+ for prev_hyp_id, hyp_token_id, cand_new_hyp_score in zip(prev_hyp_ids, token_ids, top_new_hyp_scores):
517
+ prev_hyp_id = prev_hyp_id.item()
518
+ hyp_token_id = hyp_token_id.item()
519
+ cand_new_hyp_score = cand_new_hyp_score.item()
520
+
521
+ new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_token_id]
522
+ if hyp_token_id == eos_token_id:
523
+ completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score))
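+ # drop the leading BOS and the trailing EOS before storing the finished hypothesis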
524
+ else:
525
+ new_hypotheses.append(new_hyp_sent)
526
+ new_hyp_scores.append(cand_new_hyp_score)
527
+
528
+ if len(completed_hypotheses) == beam_size:
529
+ break
530
+
531
+ hypotheses = new_hypotheses
532
+ hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=device)
533
+
534
+ if len(completed_hypotheses) == 0:
535
+ completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item()))
536
+
537
+ completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
538
+ return torch.LongTensor(completed_hypotheses[0].value).unsqueeze(0)
539
+
540
+ def save_pretrained(self, save_path):
541
+ """Save the model weights to a directory
542
+
543
+ Args:
544
+ save_path: directory to save the model
545
+ """
546
+ config = {
547
+ "num_layers": self.num_layers,
548
+ "hidden": self.hidden,
549
+ "num_heads": self.num_heads,
550
+ "fcn_hidden": self.fcn_hidden,
551
+ "src_vocab_size": self.src_vocab_size,
552
+ "tgt_vocab_size": self.tgt_vocab_size,
553
+ "max_seq_len": self.max_seq_len,
554
+ "dropout": self.dropout_rate,
555
+ }
556
+
557
+ with open(os.path.join(save_path, "model_config.json"), "w") as f:
558
+ json.dump(config, f)
559
+
560
+ state_dict = self.state_dict()
561
+ torch.save(state_dict, os.path.join(save_path, "model.pt"))
562
+
563
+ @classmethod
564
+ def from_pretrained(cls, save_path, map_location=None):
565
+ """Load the model weights from a directory
566
+
567
+ Args:
568
+ save_path: directory to load the model
569
+ """
570
+ if map_location is None and not torch.cuda.is_available():
571
+ map_location = "cpu"
572
+
573
+ with open(os.path.join(save_path, "model_config.json"), "r") as f:
574
+ config = json.load(f)
575
+
576
+ model = cls(**config)
577
+ state_dict = torch.load(os.path.join(save_path, "model.pt"), map_location=map_location)
578
+ model.load_state_dict(state_dict)
579
+ return model
transformer_mt/utils.py ADDED
@@ -0,0 +1,42 @@
1
+ from copy import deepcopy
2
+ import random
3
+ import torch
4
+
5
+
6
+ def postprocess_text(preds, labels):
7
+ """Use this function to postprocess generations and labels before BLEU computation."""
8
+ preds = [pred.strip() for pred in preds]
9
+ labels = [[label.strip()] for label in labels]
10
+
11
+ return preds, labels
12
+
13
+
14
+ def pad(sequence_list, pad_id):
15
+ """Pads sequence_list to the longest sequence in the batch with pad_id.
16
+
17
+ Args:
18
+ sequence_list: a list of size batch_size of numpy arrays of different length
19
+ pad_id: int, a pad token id
20
+
21
+ Returns:
22
+ torch.LongTensor of shape [batch_size, max_sequence_len]
23
+ """
24
+ max_len = max(len(x) for x in sequence_list)
25
+ padded_sequence_list = []
26
+ for sequence in sequence_list:
27
+ padding = [pad_id] * (max_len - len(sequence))
28
+ padded_sequence = sequence + padding
29
+ padded_sequence_list.append(padded_sequence)
30
+
31
+ return torch.LongTensor(padded_sequence_list)
32
+
33
+
34
+ def sample_small_debug_dataset(raw_datasets):
35
+ random_indices = random.sample(list(range(len(raw_datasets["train"]))), 100)
36
+ subset = raw_datasets["train"].select(random_indices)
37
+ raw_datasets["train"] = deepcopy(subset)
38
+ if "validation" in raw_datasets:
39
+ raw_datasets["validation"] = deepcopy(subset)
40
+ if "test" in raw_datasets:
41
+ raw_datasets["test"] = deepcopy(subset)
42
+ return raw_datasets
transformer_mt_roberta/__init__.py ADDED
File without changes
transformer_mt_roberta/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (168 Bytes). View file
 
transformer_mt_roberta/__pycache__/modeling_attention.cpython-37.pyc ADDED
Binary file (2.96 kB). View file
 
transformer_mt_roberta/__pycache__/modeling_transformer.cpython-37.pyc ADDED
Binary file (11.4 kB). View file
 
transformer_mt_roberta/__pycache__/modeling_transformer_final.cpython-37.pyc ADDED
Binary file (8.15 kB). View file
 
transformer_mt_roberta/__pycache__/utils.cpython-37.pyc ADDED
Binary file (1.79 kB). View file
 
transformer_mt_roberta/modeling_attention.py ADDED
@@ -0,0 +1,126 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2022 Vladislav Lialin and Namrata Shivagunde
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+
21
+ class MultiHeadAttention(nn.Module):
22
+ def __init__(self, input_size, hidden, num_heads, causal=False):
23
+ """Multi-head attention module which computes [softmax(xQ_h @ xK_h^T) @ xV: ...] @ U
24
+
25
+ Can work as both self-attention or cross-attention (if kv is provided to .forward).
26
+
27
+ Args:
28
+ causal: use causal masking (do not allow target to look to the future or current token of source)
29
+ """
30
+ if hidden % num_heads:
31
+ raise ValueError(f"hidden should be divisible by num_heads, "
32
+ f"but got hidden={hidden} and num_heads={num_heads}")
33
+ super().__init__()
34
+
35
+ self.k = nn.Linear(input_size, hidden)
36
+ self.q = nn.Linear(input_size, hidden)
37
+ self.v = nn.Linear(input_size, hidden)
38
+ self.mix = nn.Linear(hidden, hidden)
39
+
40
+ self.num_heads = num_heads
41
+ self.head_size = hidden // num_heads
42
+ self.scale = self.head_size ** 0.5
43
+ self.causal = causal # causal masking
44
+
45
+ def forward(self, q, kv=None, key_padding_mask=None, return_attention=False):
46
+ """[Softmax(source Q_1 @ target K_1^T) @ target V_1 : ... ) @ x V_heads] @ U
47
+
48
+ Performs self-attention if kv is not specified.
49
+ In this case, kv = q and kv_seq_len = query_seq_len.
50
+
51
+ Args:
52
+ q: FloatTensor[batch_size, query_seq_len, input_size]
53
+ kv (target) : optional, FloatTensor[batch_size, kv_seq_len, input_size]
54
+ key_padding_mask: BoolTensor[batch_size, kv_seq_len] 0 means unpadded, 1 means padded
55
+
56
+ Returns:
57
+ FloatTensor[batch_size, seq_len, hidden]
58
+ """
59
+
60
+ # Task 1.1 (1 point)
61
+ # Update this function with cross-attention mechanism
62
+ # If target is None, then target (kv) and source (q) will be same.
63
+ # Define k, q, v using self.k, self.q and self.v based on if the target exists or not
64
+ # Note : Please write shape of each tensor for each line of code
65
+ ## YOUR CODE STARTS HERE## ~ 2 lines code
66
+ # compute projections; fall back to self-attention when kv is not given
+ kv = kv if kv is not None else q
+ k = self.k(kv)  # [batch_size, kv_seq_len, hidden]
+ v = self.v(kv)  # [batch_size, kv_seq_len, hidden]
+ q = self.q(q)  # [batch_size, query_seq_len, hidden]
71
+
72
+ # YOUR CODE ENDS HERE
73
+
74
+ bs, attending_seq, _ = q.shape
75
+ attended_seq = k.shape[1]
76
+
77
+ # [b, s, h] -> [b, h, s] -> [b * heads, h / heads, s] -> [b * heads, s, h / heads]
78
+ k = k.transpose(1, 2).reshape(bs * self.num_heads, self.head_size, -1).transpose(1, 2).contiguous() # [batch * num_heads, seq, hidden / num_heads]
79
+ q = q.transpose(1, 2).reshape(bs * self.num_heads, self.head_size, -1).transpose(1, 2).contiguous()
80
+ v = v.transpose(1, 2).reshape(bs * self.num_heads, self.head_size, -1).transpose(1, 2).contiguous()
81
+
82
+ scores = q @ k.transpose(1, 2) / self.scale # [batch * num_heads, attending_seq, attended_seq]
83
+ assert scores.shape == (bs * self.num_heads, attending_seq, attended_seq)
84
+
85
+
86
+ if key_padding_mask is not None:
87
+ # Task 1.2 (1 point)
88
+ # Padding
89
+ # Set the scores corresponding to padded positions (key_padding_mask == 1) to -inf
90
+ #
91
+ # You might need to reshape the scores to [batch_size, seq_len, seq_len]
92
+ # in this case, remember to reshape them back
93
+ # Our implementation is 3 lines
94
+ # YOUR CODE STARTS HERE
95
+ # print(scores.shape, key_padding_mask.unsqueeze(-2).shape)
96
+
97
+
98
+ # view scores as [batch, heads, attending_seq, attended_seq] so the padding mask broadcasts per example
+ scores = scores.reshape(bs, self.num_heads, attending_seq, attended_seq)
+ scores = scores.masked_fill(key_padding_mask[:, None, None, :] == 1, value=float("-inf"))
+ scores = scores.view(bs * self.num_heads, attending_seq, attended_seq)
104
+
105
+
106
+ # YOUR CODE ENDS HERE
107
+
108
+ assert scores.size() == (bs * self.num_heads, attending_seq, attended_seq),\
109
+ f"scores have wrong shape. Expected {(bs * self.num_heads, attending_seq, attended_seq)}, got {scores.size()}"
110
+
111
+ if self.causal:
112
+ causal_mask = torch.triu(torch.ones(attending_seq, attended_seq, dtype=torch.bool, device=scores.device), diagonal=1)
113
+ scores.masked_fill_(causal_mask.bool().unsqueeze(0), float("-inf"))
114
+
115
+ probs = torch.softmax(scores, dim=-1) # [batch * num_heads, tgt_seq, src_seq]
116
+ att = probs @ v # [batch * num_heads, tgt_seq, hidden / num_heads]
117
+
118
+ # [b * heads, s, h / heads] -> [b * heads, h / heads, s] -> [b, h, s] -> [b, s, h]
119
+ att = att.transpose(1, 2).reshape(bs, -1, attending_seq).transpose(1, 2).contiguous()
120
+
121
+ att = self.mix(att)
122
+
123
+ if return_attention:
124
+ return att, probs
125
+
126
+ return att
transformer_mt_roberta/modeling_transformer_final.py ADDED
@@ -0,0 +1,353 @@
1
+ import os
2
+ import json
3
+ from collections import namedtuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from transformer_mt.modeling_attention import MultiHeadAttention
10
+ from transformer_mt.utils import pad
11
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
12
+
13
+ Hypothesis = namedtuple("Hypothesis", ["value", "score"])
14
+
15
+ class TransformerDecoderLayer(nn.Module):
16
+ def __init__(self, hidden, num_heads, fcn_hidden, dropout=0.0):
17
+ super().__init__()
18
+
19
+
20
+ self.self_attention = MultiHeadAttention(
21
+ input_size=hidden,
22
+ hidden=hidden,
23
+ num_heads=num_heads,
24
+ causal=True,
25
+ )
26
+
27
+ self.cross_attention = MultiHeadAttention(
28
+ input_size=hidden,
29
+ hidden=hidden,
30
+ num_heads=num_heads,
31
+ causal=False,
32
+ )
33
+
34
+ self.self_att_layer_norm = nn.LayerNorm(hidden)
35
+ self.cross_att_layer_norm = nn.LayerNorm(hidden)
36
+
37
+ self.fcn = nn.Sequential(
38
+ nn.Linear(hidden, fcn_hidden),
39
+ nn.ReLU(),
40
+ nn.Linear(fcn_hidden, hidden),
41
+ )
42
+ self.fcn_layer_norm = nn.LayerNorm(hidden)
43
+ self.dropout = nn.Dropout(dropout)
44
+
45
+ # YOUR CODE ENDS HERE
46
+
47
+ def forward(self, decoder_hidden_states, encoder_hidden_states, key_padding_mask=None):
48
+
49
+ residual_1 = decoder_hidden_states
50
+ out = self.self_attention(decoder_hidden_states, key_padding_mask=None)
51
+ out = self.self_att_layer_norm(residual_1 + out)
52
+ residual_2 = out
53
+ out = self.cross_attention(q = out, kv = encoder_hidden_states, key_padding_mask = key_padding_mask)
54
+
55
+ out = self.cross_att_layer_norm(out+residual_2)
56
+ out = self.fcn(out)
57
+ out = self.dropout(out)
58
+ residual_3 = out
59
+ out = self.fcn_layer_norm(out+residual_3)
60
+
61
+ return out
62
+
63
+
64
+ class TransfomerEncoderDecoderModel(nn.Module):
65
+ def __init__(
66
+ self,
67
+ *,
68
+ num_layers,
69
+ hidden,
70
+ num_heads,
71
+ fcn_hidden,
72
+ max_seq_len,
73
+ src_vocab_size,
74
+ tgt_vocab_size,
75
+ dropout=0.1,
76
+ ):
77
+ super().__init__()
78
+ self.src_vocab_size = src_vocab_size
79
+ self.tgt_vocab_size = tgt_vocab_size
80
+ self.num_layers = num_layers
81
+ self.hidden = hidden
82
+ self.num_heads = num_heads
83
+ self.fcn_hidden = fcn_hidden
84
+ self.dropout_rate = dropout
85
+ self.max_seq_len = max_seq_len
86
+
87
+ self.decoder_embeddings = nn.Embedding(self.tgt_vocab_size, self.hidden)
88
+ self.positional_emb = nn.Embedding(self.max_seq_len, self.hidden)
89
+
90
+ self.out_proj = nn.Linear(self.hidden, self.tgt_vocab_size)
91
+
92
+ self.dropout = nn.Dropout(self.dropout_rate)
93
+
94
+ self.encoder = AutoModelForMaskedLM.from_pretrained("flax-community/roberta-base-danish", output_hidden_states=True)
95
+
96
+ self.decoder_layers = nn.ModuleList([TransformerDecoderLayer(hidden = self.hidden,
97
+ num_heads = self.num_heads,
98
+ fcn_hidden = self.fcn_hidden,
99
+ dropout=self.dropout_rate
100
+ )
101
+ for _ in range(self.num_layers)
102
+ ])
103
+
104
+ # YOUR CODE ENDS HERE
105
+
106
+ def _add_positions(self, sequence_tensor):
107
+
108
+ seq_len = sequence_tensor.shape[1]
109
+ positions = torch.arange(seq_len, device=sequence_tensor.device)
110
+ positional_emb = self.positional_emb(positions)
111
+ output = sequence_tensor + positional_emb
112
+ return output
113
+
114
+ def forward(
115
+ self,
116
+ input_ids=None,
117
+ encoder_hidden_states=None,
118
+ decoder_input_ids=None,
119
+ key_padding_mask=None,
120
+ ):
121
+
122
+ if input_ids is None and encoder_hidden_states is None:
123
+ raise ValueError("You should provide either input_ids or encoder_hidden_states")
124
+
125
+ if encoder_hidden_states is None:
126
+ encoder_hidden_states = self.encoder(input_ids, output_hidden_states=True)
127
+ encoder_hidden_states = encoder_hidden_states.hidden_states[-1]
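+ # the last hidden layer of the pretrained Danish RoBERTa serves as the encoder output for cross-attention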
128
+ # print( encoder_hidden_states.shape)
129
+
130
+ logits = self._decode(encoder_hidden_states, decoder_input_ids, key_padding_mask)
131
+ # print(logits.shape)
132
+
133
+
134
+ return logits
135
+
136
+ def _decode(self, encoder_hidden_states, decoder_input_ids, key_padding_mask):
137
+
138
+ decoder_embedding = self.decoder_embeddings(decoder_input_ids)
139
+ decoder_embedding = self._add_positions(decoder_embedding)
140
+
141
+ for l in self.decoder_layers:
142
+ decoder_embedding = l(decoder_hidden_states = decoder_embedding, encoder_hidden_states=encoder_hidden_states, key_padding_mask = key_padding_mask)
143
+
144
+ logits = self.out_proj(decoder_embedding)
145
+ ## YOUR CODE ENDS HERE
146
+ return logits
147
+
148
+
149
+ @torch.inference_mode()
150
+ def generate(
151
+ self,
152
+ input_ids,
153
+ *,
154
+ bos_token_id,
155
+ eos_token_id,
156
+ pad_token_id=None,
157
+ key_padding_mask=None,
158
+ max_length=50,
159
+ beam_size=5,
160
+ kind="beam_search",
161
+ ):
162
+
163
+ if kind not in ["greedy", "beam_search"]:
164
+ raise ValueError("Unknown kind of generation: {}".format(kind))
165
+ if kind == "beam_search" and pad_token_id is None:
166
+ raise ValueError("Beam search requires a pad_token_id to be provided")
167
+
168
+ if kind == "greedy":
169
+ return self._generate_greedy(
170
+ input_ids=input_ids,
171
+ bos_token_id=bos_token_id,
172
+ eos_token_id=eos_token_id,
173
+ key_padding_mask=key_padding_mask,
174
+ max_length=max_length,
175
+ )
176
+
177
+ # beam search only supports batch size 1
178
+ beam_search_generations = []
179
+ for i in range(input_ids.size(0)):
180
+ _input_ids = input_ids[i].unsqueeze(0)
181
+ _key_padding_mask = key_padding_mask[i].unsqueeze(0) if key_padding_mask is not None else None
182
+
183
+ generated = self._generate_beam_search(
184
+ input_ids=_input_ids,
185
+ bos_token_id=bos_token_id,
186
+ eos_token_id=eos_token_id,
187
+ key_padding_mask=_key_padding_mask,
188
+ max_length=max_length,
189
+ beam_size=beam_size,
190
+ )
191
+
192
+ beam_search_generations.append(generated[0].detach().cpu().tolist())
193
+
194
+ return pad(beam_search_generations, pad_id=eos_token_id)
195
+
196
+ @torch.inference_mode()
197
+ def _generate_greedy(
198
+ self,
199
+ input_ids,
200
+ *,
201
+ bos_token_id,
202
+ eos_token_id,
203
+ key_padding_mask=None,
204
+ max_length=50,
205
+ ):
206
+
207
+ # encoder_hidden_states = self._encode(input_ids, key_padding_mask)
208
+ encoder_hidden_states = self.encoder(input_ids, output_hidden_states=True, attention_mask=key_padding_mask)
209
+ encoder_hidden_states = encoder_hidden_states.hidden_states[-1]
210
+
211
+
212
+ decoder_input_ids = torch.full((input_ids.shape[0], 1), bos_token_id, dtype=torch.long, device=input_ids.device)
213
+ translation = torch.zeros((input_ids.shape[0], 0), dtype=torch.long, device=input_ids.device)
214
+
215
+ eos_flags = torch.zeros((input_ids.shape[0],), dtype=torch.uint8, device=input_ids.device)
216
+
217
+ for _ in range(max_length):
218
+ logits = self._decode(encoder_hidden_states, decoder_input_ids, key_padding_mask)
219
+ logits = logits[:, -1, :]
220
+
221
+ next_token_id = torch.argmax(logits, dim=-1)
222
+
223
+ decoder_input_ids = torch.cat((decoder_input_ids, next_token_id.unsqueeze(1)), dim=1)
224
+ translation = torch.cat((translation, next_token_id.unsqueeze(1)), dim=1)
225
+
226
+ eos_flags |= (next_token_id == eos_token_id)
227
+
228
+ if eos_flags.all():
229
+ break
230
+
231
+ return translation
232
+
233
+ @torch.inference_mode()
234
+ def _generate_beam_search(
235
+ self,
236
+ input_ids,
237
+ *,
238
+ bos_token_id,
239
+ eos_token_id,
240
+ key_padding_mask=None,
241
+ beam_size=5,
242
+ max_length=50,
243
+ ):
244
+
245
+ assert len(input_ids) == 1, "Beam search is only supported for a single input sequence"
246
+ #encoder_hidden_states = self._encode(input_ids, key_padding_mask)
247
+ encoder_hidden_states = self.encoder(input_ids, output_hidden_states=True, attention_mask=key_padding_mask)
248
+ encoder_hidden_states = encoder_hidden_states.hidden_states[-1]
249
+ device = input_ids.device
250
+
251
+ hypotheses = [[bos_token_id]]
252
+ hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=device)
253
+ completed_hypotheses = []
254
+
255
+ for _ in range(max_length):
256
+ if len(completed_hypotheses) >= beam_size:
257
+ break
258
+
259
+ hyp_num = len(hypotheses)
260
+ expanded_encoder_hidden_states = encoder_hidden_states.expand(
261
+ hyp_num,
262
+ encoder_hidden_states.size(1),
263
+ encoder_hidden_states.size(2),
264
+ )
265
+
266
+ # [batch_size*hyp_num=1*hyp_num, seq_len, hidden]
267
+ hypotheses_tensor = torch.tensor(hypotheses, dtype=torch.int64, device=device)
268
+ logits = self._decode(expanded_encoder_hidden_states, hypotheses_tensor, key_padding_mask)
269
+ logits = logits[:, -1, :] # [vocab_size]
270
+
271
+ log_p_t = F.log_softmax(logits, dim=-1)
272
+ live_hyp_num = beam_size - len(completed_hypotheses)
273
+
274
+ # [hyp_num] -> [1, hyp_num] -> [hyp_num, vocab_size] -> [hyp_num * vocab_size]
275
+ new_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
276
+ # [live_hyp_num], [live_hyp_num]
277
+ # for indices, the values range from 0 to hyp_num * vocab_size
278
+ top_new_hyp_scores, top_new_hyp_pos = torch.topk(new_hyp_scores, k=live_hyp_num)
279
+
280
+ # hypotheses ids in hyp_scores tensor [hyp_num,]
281
+ prev_hyp_ids = torch.div(top_new_hyp_pos, self.tgt_vocab_size, rounding_mode='floor')
282
+
283
+ # ids of the next words for each hypothesis
284
+ token_ids = top_new_hyp_pos % self.tgt_vocab_size
285
+
286
+ new_hypotheses = []
287
+ new_hyp_scores = []
288
+
289
+ # iterate live_hyp_num times
290
+ for prev_hyp_id, hyp_token_id, cand_new_hyp_score in zip(prev_hyp_ids, token_ids, top_new_hyp_scores):
291
+ prev_hyp_id = prev_hyp_id.item()
292
+ hyp_token_id = hyp_token_id.item()
293
+ cand_new_hyp_score = cand_new_hyp_score.item()
294
+
295
+ new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_token_id]
296
+ if hyp_token_id == eos_token_id:
297
+ completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score))
298
+ else:
299
+ new_hypotheses.append(new_hyp_sent)
300
+ new_hyp_scores.append(cand_new_hyp_score)
301
+
302
+ if len(completed_hypotheses) == beam_size:
303
+ break
304
+
305
+ hypotheses = new_hypotheses
306
+ hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=device)
307
+
308
+ if len(completed_hypotheses) == 0:
309
+ completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item()))
310
+
311
+ completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
312
+ return torch.LongTensor(completed_hypotheses[0].value).unsqueeze(0)
313
+
314
+ def save_pretrained(self, save_path):
315
+ """Save the model weights to a directory
316
+
317
+ Args:
318
+ save_path: directory to save the model
319
+ """
320
+ config = {
321
+ "num_layers": self.num_layers,
322
+ "hidden": self.hidden,
323
+ "num_heads": self.num_heads,
324
+ "fcn_hidden": self.fcn_hidden,
325
+ "src_vocab_size": self.src_vocab_size,
326
+ "tgt_vocab_size": self.tgt_vocab_size,
327
+ "max_seq_len": self.max_seq_len,
328
+ "dropout": self.dropout_rate,
329
+ }
330
+
331
+ with open(os.path.join(save_path, "model_config.json"), "w") as f:
332
+ json.dump(config, f)
333
+
334
+ state_dict = self.state_dict()
335
+ torch.save(state_dict, os.path.join(save_path, "model.pt"))
336
+
337
+ @classmethod
338
+ def from_pretrained(cls, save_path, map_location=None):
339
+ """Load the model weights from a directory
340
+
341
+ Args:
342
+ save_path: directory to load the model
343
+ """
344
+ if map_location is None and not torch.cuda.is_available():
345
+ map_location = "cpu"
346
+
347
+ with open(os.path.join(save_path, "model_config.json"), "r") as f:
348
+ config = json.load(f)
349
+
350
+ model = cls(**config)
351
+ state_dict = torch.load(os.path.join(save_path, "model.pt"), map_location=map_location)
352
+ model.load_state_dict(state_dict)
353
+ return model
transformer_mt_roberta/utils.py ADDED
@@ -0,0 +1,42 @@
1
+ from copy import deepcopy
2
+ import random
3
+ import torch
4
+
5
+
6
+ def postprocess_text(preds, labels):
7
+ """Use this function to postprocess generations and labels before BLEU computation."""
8
+ preds = [pred.strip() for pred in preds]
9
+ labels = [[label.strip()] for label in labels]
10
+
11
+ return preds, labels
12
+
13
+
14
+ def pad(sequence_list, pad_id):
15
+ """Pads sequence_list to the longest sequence in the batch with pad_id.
16
+
17
+ Args:
18
+ sequence_list: a list of size batch_size of numpy arrays of different length
19
+ pad_id: int, a pad token id
20
+
21
+ Returns:
22
+ torch.LongTensor of shape [batch_size, max_sequence_len]
23
+ """
24
+ max_len = max(len(x) for x in sequence_list)
25
+ padded_sequence_list = []
26
+ for sequence in sequence_list:
27
+ padding = [pad_id] * (max_len - len(sequence))
28
+ padded_sequence = sequence + padding
29
+ padded_sequence_list.append(padded_sequence)
30
+
31
+ return torch.LongTensor(padded_sequence_list)
32
+
33
+
34
+ def sample_small_debug_dataset(raw_datasets):
35
+ random_indices = random.sample(list(range(len(raw_datasets["train"]))), 100)
36
+ subset = raw_datasets["train"].select(random_indices)
37
+ raw_datasets["train"] = deepcopy(subset)
38
+ if "validation" in raw_datasets:
39
+ raw_datasets["validation"] = deepcopy(subset)
40
+ if "test" in raw_datasets:
41
+ raw_datasets["test"] = deepcopy(subset)
42
+ return raw_datasets