ftakelait committed
Commit
b1c0f8d
1 Parent(s): 23a7f71

Add application files

app.py ADDED
@@ -0,0 +1,60 @@
+ import warnings
+ from cryptography.utils import CryptographyDeprecationWarning
+
+ with warnings.catch_warnings():
+     warnings.filterwarnings('ignore', category=CryptographyDeprecationWarning)
+     import paramiko
+
+ import gradio as gr
+ # from transformers import pipeline
+ from transformers import PreTrainedTokenizerFast, AutoTokenizer
+ from transformer_mt.modeling_transformer import TransfomerEncoderDecoderModel
+ from transformer_mt_roberta.modeling_transformer_final import TransfomerEncoderDecoderModel as mt_roberta
+
+ # translation_pipeline = pipeline('translation_en_to_fr')
+
+ # Setting up the translation transformer in Gradio.
+ # def translator_fn(text_input):
+ #     results = translation_pipeline(text_input)
+ #     return results[0]['translation_text']
+
+ # def translator_fn_baseline(text_in):
+ #     source_tokenizer = PreTrainedTokenizerFast.from_pretrained("da_en_output_dir/da_tokenizer")
+ #     target_tokenizer = PreTrainedTokenizerFast.from_pretrained("da_en_output_dir/en_tokenizer")
+ #     model = TransfomerEncoderDecoderModel.from_pretrained("da_en_output_dir")
+ #
+ #     input_ids = source_tokenizer.encode(text_in, return_tensors="pt")
+ #     output_ids = model.generate(
+ #         input_ids,
+ #         max_length=10,
+ #         bos_token_id=target_tokenizer.bos_token_id,
+ #         eos_token_id=target_tokenizer.eos_token_id,
+ #         pad_token_id=target_tokenizer.pad_token_id,
+ #     )
+ #
+ #     return target_tokenizer.decode(output_ids[0])
+
+ def translator_fn_roberta(text_in):
+     # Danish RoBERTa tokenizer on the source side, custom English tokenizer on the target side
+     source_tokenizer_pretrained_roberta = AutoTokenizer.from_pretrained("flax-community/roberta-base-danish")
+     target_tokenizer_pretrained_roberta = PreTrainedTokenizerFast.from_pretrained("da_en_output_dir/en_tokenizer")
+     model_pretrained_roberta = mt_roberta.from_pretrained("da_en_RoBERTa_pretrained")
+
+     input_ids_pretrained_roberta = source_tokenizer_pretrained_roberta.encode(text_in, return_tensors="pt")
+     output_ids_pretrained_roberta = model_pretrained_roberta.generate(
+         input_ids_pretrained_roberta,
+         max_length=10,
+         bos_token_id=target_tokenizer_pretrained_roberta.bos_token_id,
+         eos_token_id=target_tokenizer_pretrained_roberta.eos_token_id,
+         pad_token_id=target_tokenizer_pretrained_roberta.pad_token_id,
+     )
+     return target_tokenizer_pretrained_roberta.decode(output_ids_pretrained_roberta[0])
+
+ iface = gr.Interface(fn=translator_fn_roberta,
+                      inputs=gr.inputs.Textbox(lines=2, placeholder=None, label="Your Danish text goes here."),
+                      outputs=['text'],  # the list should match the number of values returned by fn (one text output here)
+                      description="This app translates text from Danish to English.",
+                      title="Danish to English Translator App",
+                      theme="peach")
+
+ iface.launch(share=False, enable_queue=True)
da_en_RoBERTa_pretrained/en_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]"}
da_en_RoBERTa_pretrained/en_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
da_en_RoBERTa_pretrained/en_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]", "tokenizer_class": "PreTrainedTokenizerFast"}
da_en_RoBERTa_pretrained/model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:43e9463469dfeb0d2c5fed75b6181ec570e95fda4c6565c6f80387782f1aa618
+ size 885137451
da_en_RoBERTa_pretrained/model_config.json ADDED
@@ -0,0 +1 @@
+ {"num_layers": 6, "hidden": 768, "num_heads": 8, "fcn_hidden": 2048, "src_vocab_size": 32000, "tgt_vocab_size": 32000, "max_seq_len": 128, "dropout": 0.1}
da_en_output_dir/da_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]"}
da_en_output_dir/da_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
da_en_output_dir/da_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]", "tokenizer_class": "PreTrainedTokenizerFast"}
da_en_output_dir/en_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]"}
da_en_output_dir/en_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
da_en_output_dir/en_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]", "tokenizer_class": "PreTrainedTokenizerFast"}
da_en_output_dir/model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d93af21df63a573aac135ee8e6a3e984424471f07e707a942f660be1854f1067
+ size 616931903
da_en_output_dir/model_config.json ADDED
@@ -0,0 +1 @@
+ {"num_layers": 6, "hidden": 768, "num_heads": 8, "fcn_hidden": 2048, "src_vocab_size": 32000, "tgt_vocab_size": 32000, "max_seq_len": 128, "dropout": 0.1}
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch >= 1.3
+ datasets >= 1.8.0
+ tokenizers
+ wandb
+ transformers
transformer_mt/__init__.py ADDED
File without changes
transformer_mt/modeling_attention.py ADDED
@@ -0,0 +1,126 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2022 Vladislav Lialin and Namrata Shivagunde
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+
21
+ class MultiHeadAttention(nn.Module):
22
+ def __init__(self, input_size, hidden, num_heads, causal=False):
23
+ """Multi-head attention module which computes [softmax(xQ_h @ xK_h^T) @ xV: ...] @ U
24
+
25
+ Can work as both self-attention or cross-attention (if kv is provided to .forward).
26
+
27
+ Args:
28
+ causal: use causal masking (do not allow target to look to the future or current token of source)
29
+ """
30
+ if hidden % num_heads:
31
+ raise ValueError(f"hidden should be divisible by num_heads, "
32
+ f"but got hidden={hidden} and num_heads={num_heads}")
33
+ super().__init__()
34
+
35
+ self.k = nn.Linear(input_size, hidden)
36
+ self.q = nn.Linear(input_size, hidden)
37
+ self.v = nn.Linear(input_size, hidden)
38
+ self.mix = nn.Linear(hidden, hidden)
39
+
40
+ self.num_heads = num_heads
41
+ self.head_size = hidden // num_heads
42
+ self.scale = self.head_size ** 0.5
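+ # scale factor sqrt(head_size), used for scaled dot-product attention in forward()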
43
+ self.causal = causal # causal masking
44
+
45
+ def forward(self, q, kv=None, key_padding_mask=None, return_attention=False):
46
+ """[Softmax(source Q_1 @ target K_1^T) @ target V_1 : ... ) @ x V_heads] @ U
47
+
48
+ Performs self-attention if kv is not specified.
49
+ In this case, kv = q and kv_seq_len = query_seq_len.
50
+
51
+ Args:
52
+ q: FloatTensor[batch_size, query_seq_len, input_size]
53
+ kv (target) : optional, FloatTensor[batch_size, kv_seq_len, input_size]
54
+ key_padding_mask: BoolTensor[batch_size, kv_seq_len] 0 means unpadded, 1 means padded
55
+
56
+ Returns:
57
+ FloatTensor[batch_size, seq_len, hidden]
58
+ """
59
+
60
+ # Task 1.1 (1 point)
61
+ # Update this function with cross-attention mechanism
62
+ # If target is None, then target (kv) and source (q) will be same.
63
+ # Define k, q, v using self.k, self.q and self.v based on if the target exists or not
64
+ # Note : Please write shape of each tensor for each line of code
65
+ ## YOUR CODE STARTS HERE## ~ 2 lines code
66
+ # compute projections; fall back to self-attention when kv is not given
+ kv = kv if kv is not None else q
+ k = self.k(kv)  # [batch_size, kv_seq_len, hidden]
+ v = self.v(kv)  # [batch_size, kv_seq_len, hidden]
+ q = self.q(q)  # [batch_size, query_seq_len, hidden]
71
+
72
+ # YOUR CODE ENDS HERE
73
+
74
+ bs, attending_seq, _ = q.shape
75
+ attended_seq = k.shape[1]
76
+
77
+ # [b, s, h] -> [b, h, s] -> [b * heads, h / heads, s] -> [b * heads, s, h / heads]
78
+ k = k.transpose(1, 2).reshape(bs * self.num_heads, self.head_size, -1).transpose(1, 2).contiguous() # [batch * num_heads, seq, hidden / num_heads]
79
+ q = q.transpose(1, 2).reshape(bs * self.num_heads, self.head_size, -1).transpose(1, 2).contiguous()
80
+ v = v.transpose(1, 2).reshape(bs * self.num_heads, self.head_size, -1).transpose(1, 2).contiguous()
81
+
82
+ scores = q @ k.transpose(1, 2) / self.scale # [batch * num_heads, attending_seq, attended_seq]
83
+ assert scores.shape == (bs * self.num_heads, attending_seq, attended_seq)
84
+
85
+
86
+ if key_padding_mask is not None:
87
+ # Task 1.2 (1 point)
88
+ # Padding
89
+ # Set the scores corresponding to padded positions (key_padding_mask == 1) to -inf
90
+ #
91
+ # You might need to reshape the scores to [batch_size, seq_len, seq_len]
92
+ # in this case, remember to reshape them back
93
+ # Our implementation is 3 lines
94
+ # YOUR CODE STARTS HERE
95
+ # print(scores.shape, key_padding_mask.unsqueeze(-2).shape)
96
+
97
+
98
+ # view scores as [batch, heads, attending_seq, attended_seq] so the padding mask broadcasts per example
+ scores = scores.reshape(bs, self.num_heads, attending_seq, attended_seq)
+ scores = scores.masked_fill(key_padding_mask[:, None, None, :] == 1, value=float("-inf"))
+ scores = scores.view(bs * self.num_heads, attending_seq, attended_seq)
104
+
105
+
106
+ # YOUR CODE ENDS HERE
107
+
108
+ assert scores.size() == (bs * self.num_heads, attending_seq, attended_seq),\
109
+ f"scores have wrong shape. Expected {(bs * self.num_heads, attending_seq, attended_seq)}, got {scores.size()}"
110
+
111
+ if self.causal:
112
+ causal_mask = torch.triu(torch.ones(attending_seq, attended_seq, dtype=torch.bool, device=scores.device), diagonal=1)
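+ # triu with diagonal=1 marks strictly-future positions, which are filled with -inf below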
113
+ scores.masked_fill_(causal_mask.bool().unsqueeze(0), float("-inf"))
114
+
115
+ probs = torch.softmax(scores, dim=-1) # [batch * num_heads, tgt_seq, src_seq]
116
+ att = probs @ v # [batch * num_heads, tgt_seq, hidden / num_heads]
117
+
118
+ # [b * heads, s, h / heads] -> [b * heads, h / heads, s] -> [b, h, s] -> [b, s, h]
119
+ att = att.transpose(1, 2).reshape(bs, -1, attending_seq).transpose(1, 2).contiguous()
120
+
121
+ att = self.mix(att)
122
+
123
+ if return_attention:
124
+ return att, probs
125
+
126
+ return att
transformer_mt/modeling_transformer.py ADDED
@@ -0,0 +1,579 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2022 Vladislav Lialin and Namrata Shivagunde
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ import os
17
+ import json
18
+ from collections import namedtuple
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+
24
+ from transformer_mt.modeling_attention import MultiHeadAttention
25
+ from transformer_mt.utils import pad
26
+
27
+
28
+ Hypothesis = namedtuple("Hypothesis", ["value", "score"])
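+ # a beam-search hypothesis stores its token ids (value) and cumulative log-probability (score)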
29
+
30
+
31
+ class TransformerEncoderLayer(nn.Module):
32
+ def __init__(self, hidden, num_heads, fcn_hidden, dropout=0.0, causal=False):
33
+ super().__init__()
34
+
35
+ self.self_attention = MultiHeadAttention(
36
+ input_size=hidden,
37
+ hidden=hidden,
38
+ num_heads=num_heads,
39
+ causal=causal,
40
+ )
41
+ self.att_layer_norm = nn.LayerNorm(hidden)
42
+
43
+ self.fcn = nn.Sequential(
44
+ nn.Linear(hidden, fcn_hidden),
45
+ nn.ReLU(),
46
+ nn.Linear(fcn_hidden, hidden),
47
+ )
48
+ self.fcn_layer_norm = nn.LayerNorm(hidden)
49
+ self.dropout = nn.Dropout(dropout)
50
+
51
+ def forward(self, x, key_padding_mask=None):
52
+ """Self-Attention -> residual -> LayerNorm -> FCN -> residual -> LayerNorm
53
+
54
+ Args:
55
+ x: FloatTensor[batch_size, seq_len, input_size]
56
+
57
+ Returns:
58
+ FloatTensor[batch_size, seq_len, hidden]
59
+ """
60
+ # print('calling encode', key_padding_mask.shape)
61
+ residual = x
62
+ x = self.self_attention(x, key_padding_mask=key_padding_mask)
63
+ x = self.att_layer_norm(x + residual)
64
+
65
+ residual = x
66
+ x = self.fcn(x)
67
+ x = self.dropout(x)
68
+ x = self.fcn_layer_norm(x + residual)
69
+
70
+
71
+ return x
72
+
73
+
74
+ class TransformerDecoderLayer(nn.Module):
75
+ def __init__(self, hidden, num_heads, fcn_hidden, dropout=0.0):
76
+ super().__init__()
77
+
78
+ # Task 2.1 (1 point)
79
+ # Create layers needed for Transformer Decoder Layer
80
+ # 1. Create self.self_attention layer using MultiHeadAttention
81
+ # 2. Create self.cross_attention layer using MultiHeadAttention
82
+ # 2a. Which one of self_attention or cross_attention should have causal=True? Set it there.
83
+ # 3. Create self.att_layer_norm, self.cross_att_layer_norm, and self.fcn_layer_norm layers using LayerNorm
84
+ # 4. Create self.fcn network using nn.Sequential, nn.ReLU and nn.Linear
85
+ # 5. Create self.dropout layer using nn.Dropout
86
+ # YOUR CODE STARTS HERE (our implementation is about 5-8 lines)
87
+
88
+ self.self_attention = MultiHeadAttention(
89
+ input_size=hidden,
90
+ hidden=hidden,
91
+ num_heads=num_heads,
92
+ causal=True,
93
+ )
94
+
95
+ self.cross_attention = MultiHeadAttention(
96
+ input_size=hidden,
97
+ hidden=hidden,
98
+ num_heads=num_heads,
99
+ causal=False,
100
+ )
101
+
102
+ self.self_att_layer_norm = nn.LayerNorm(hidden)
103
+ self.cross_att_layer_norm = nn.LayerNorm(hidden)
104
+
105
+ self.fcn = nn.Sequential(
106
+ nn.Linear(hidden, fcn_hidden),
107
+ nn.ReLU(),
108
+ nn.Linear(fcn_hidden, hidden),
109
+ )
110
+ self.fcn_layer_norm = nn.LayerNorm(hidden)
111
+ self.dropout = nn.Dropout(dropout)
112
+
113
+ # YOUR CODE ENDS HERE
114
+
115
+ def forward(self, decoder_hidden_states, encoder_hidden_states, key_padding_mask=None):
116
+ """Transformer Decoder Layer
117
+
118
+ Args:
119
+ decoder_hidden_states: FloatTensor[batch_size, query_seq_len, hidden]
120
+ encoder_hidden_states: FloatTensor[batch_size, kv_seq_len, hidden]
121
+ key_padding_mask: ByteTensor[batch_size, kv_seq_len] with 1 for padded tokens and 0 for regular tokens
122
+
123
+ Returns:
124
+ FloatTensor[batch_size, query_seq_len, hidden]
125
+ """
126
+
127
+ # Task 2.2 (1 point)
128
+ # Implement Transformer decoder block
129
+ # Remember that transformer decoder block is composed of:
130
+ # 1. Self-Attention
131
+ # 2. Residual connection
132
+ # 3. LayerNorm
133
+ # 4. Cross-Attention
134
+ # 5. Residual connection
135
+ # 6. LayerNorm
136
+ # 7. Fully-Connected Layer
137
+ # 8. Dropout
138
+ # 9. Residual connection
139
+ # 10. LayerNorm
140
+ # Note : Please write shape of the tensor for each line of code
141
+ # YOUR CODE STARTS HERE (our implementation is about 10 lines)
142
+ # print('calling decode', "decoder hidden states:",decoder_hidden_states.shape, 'encoder_hidden_states:',encoder_hidden_states.shape, "key_oadding:",key_padding_mask.shape)
143
+ residual_1 = decoder_hidden_states
144
+ # print("calling_self attention for decoder")
145
+ out = self.self_attention(decoder_hidden_states, key_padding_mask=None)
146
+ out = self.self_att_layer_norm(residual_1 + out)
147
+ residual_2 = out
148
+ # print("calling_cross attention for decoder")
149
+ out = self.cross_attention(q = out, kv = encoder_hidden_states, key_padding_mask = key_padding_mask)
150
+ # print("out after cross", out.shape)
151
+ # print('----')
152
+ out = self.cross_att_layer_norm(out + residual_2)
+ # feed-forward block: keep the pre-FCN activations for the residual connection
+ residual_3 = out
+ out = self.fcn(out)
+ out = self.dropout(out)
+ out = self.fcn_layer_norm(out + residual_3)
157
+
158
+
159
+ ##YOUR CODE ENDS HERE##
160
+ return out
161
+
162
+
163
+ class TransfomerEncoderDecoderModel(nn.Module):
164
+ def __init__(
165
+ self,
166
+ *,
167
+ num_layers,
168
+ hidden,
169
+ num_heads,
170
+ fcn_hidden,
171
+ max_seq_len,
172
+ src_vocab_size,
173
+ tgt_vocab_size,
174
+ dropout=0.1,
175
+ ):
176
+ """A minimal implementation of Transformer Encoder Decoder Model
177
+
178
+ Args:
179
+ num_layers: number of layers for the encoder and the decoder (in total, the model has 2 * num_layers layers)
+ hidden: embedding size and hidden size of the attention layers
+ fcn_hidden: hidden size of the fully-connected networks inside the transformer layers
+ max_seq_len: maximum length of the input or target sequence, whichever is longer
+ src_vocab_size: source vocabulary size
+ tgt_vocab_size: target vocabulary size
186
+ """
187
+ super().__init__()
188
+ self.src_vocab_size = src_vocab_size
189
+ self.tgt_vocab_size = tgt_vocab_size
190
+ self.num_layers = num_layers
191
+ self.hidden = hidden
192
+ self.num_heads = num_heads
193
+ self.fcn_hidden = fcn_hidden
194
+ self.dropout_rate = dropout
195
+ self.max_seq_len = max_seq_len
196
+
197
+ # Task 2.3 (1 point)
198
+ # 1. Create encoder, decoder and positional embedding layer
199
+ # Use nn.Embedding for that and make sure to include source and target vocabulary size
200
+ # 2. Create a linear layer out_proj that will project contextualized representations
201
+ # of size hidden to your target vocabulary size.
202
+ # 3. Create a dropout layer
203
+ # YOUR CODE STARTS HERE (our implementation is about 5 lines)
204
+
205
+ self.encoder_embeddings = nn.Embedding(self.src_vocab_size, self.hidden)
206
+ self.decoder_embeddings = nn.Embedding(self.tgt_vocab_size, self.hidden)
207
+ self.positional_emb = nn.Embedding(self.max_seq_len, self.hidden)
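+ # learned position embeddings, shared by encoder and decoder inputs via _add_positions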
208
+
209
+ self.out_proj = nn.Linear(self.hidden, self.tgt_vocab_size)
210
+
211
+ self.dropout = nn.Dropout(self.dropout_rate)
212
+ # YOUR CODE ENDS HERE
213
+
214
+ # Task 2.4 (1 point)
215
+ # 1. Create a list of encoder Layers
216
+ # 2. Create a list of decoder Layers
217
+ #
218
+ # Note that you need to wrap it with nn.ModuleList,
219
+ # so that the parameters of the layers are registered as parameters of the model
220
+ # https://pytorch.org/docs/stable/generated/torch.nn.ModuleList.html
221
+ # Read more about ModuleList here:
222
+ # https://github.com/FrancescoSaverioZuppichini/Pytorch-how-and-when-to-use-Module-Sequential-ModuleList-and-ModuleDict
223
+ # You can use a for-loop or a Python list comprehension to create the list of layers
224
+ #
225
+ # YOUR CODE STARTS HERE (our implementation is 3-6 lines)
226
+ self.encoder_layers = nn.ModuleList([TransformerEncoderLayer(hidden = self.hidden,
227
+ num_heads = self.num_heads,
228
+ fcn_hidden = self.fcn_hidden,
229
+ dropout=self.dropout_rate
230
+ )
231
+ for _ in range(self.num_layers)
232
+ ])
233
+
234
+ self.decoder_layers = nn.ModuleList([TransformerDecoderLayer(hidden = self.hidden,
235
+ num_heads = self.num_heads,
236
+ fcn_hidden = self.fcn_hidden,
237
+ dropout=self.dropout_rate
238
+ )
239
+ for _ in range(self.num_layers)
240
+ ])
241
+
242
+ # YOUR CODE ENDS HERE
243
+
244
+ def _add_positions(self, sequence_tensor):
245
+ """Adds positional embeddings to the input tensor.
246
+ Args:
247
+ sequence_tensor: FloatTensor[batch_size, seq_len, hidden]
248
+ """
249
+ seq_len = sequence_tensor.shape[1]
250
+ positions = torch.arange(seq_len, device=sequence_tensor.device)
251
+ positional_emb = self.positional_emb(positions)
252
+ output = sequence_tensor + positional_emb
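+ # positional_emb is [seq_len, hidden] and broadcasts over the batch dimension of sequence_tensor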
253
+ return output
254
+
255
+ def forward(
256
+ self,
257
+ input_ids=None,
258
+ encoder_hidden_states=None,
259
+ decoder_input_ids=None,
260
+ key_padding_mask=None,
261
+ ):
262
+ """
263
+ input_ids -> encoder_emb -> encoder ->
264
+ --> decoder(encoder_output, decoder_emb) -> logits
265
+ decoder_input_ids -> decoder_emb ---->
266
+
267
+ Model accepts either input_ids or encoder_hidden_states.
268
+ The former is used for training, the latter is used for inference, because during inference
269
+ we don't have the target sequence and want to forward the decoder multiple times.
270
+ To make the inference more efficient, we can only compute encoder output once and reuse it
271
+ for all decoder steps.
272
+
273
+ Meaning during training you should forward the model like this:
274
+ model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
275
+
276
+ but during inference (generating translation) you should forward the model like this:
277
+ model(encoder_hidden_states=encoder_hidden_states, decoder_input_ids=decoder_input_ids)
278
+
279
+ Args:
280
+ input_ids (LongTensor): Encoder input sequence of size (batch_size, seq_len)
281
+ encoder_hidden_states (FloatTensor): Encoder hidden states of size (batch_size, seq_len, hidden)
282
+ decoder_input_ids (LongTensor) : Decoder input sequence of size (batch_size, out_seq_len)
283
+ key_padding_mask (ByteTensor): Mask of size (batch_size, seq_len) where 1 means that the token is padding
284
+
285
+ Return:
286
+ logits (FloatTensor): Logits for output sequence of size (batch_size, out_seq_len, dec_vocab_size)
287
+
288
+ """
289
+ if input_ids is None and encoder_hidden_states is None:
290
+ raise ValueError("You should provide either input_ids or encoder_hidden_states")
291
+
292
+ if encoder_hidden_states is None:
293
+ encoder_hidden_states = self._encode(input_ids, key_padding_mask)
294
+
295
+ logits = self._decode(encoder_hidden_states, decoder_input_ids, key_padding_mask)
296
+ # print("Targte vocab size", decoder_input_ids.shape)
297
+ # print("logits---------", logits.shape)
298
+
299
+ return logits
300
+
301
+ def _encode(self, input_ids, key_padding_mask):
302
+ # Task 2.5 (2 points)
303
+ # 1. Get source embeddings using self.encoder_embeddings
304
+ # 2. Add positional embedding to encoder embeddings using _add_positions
305
+ # 3. Pass source embeddings through the encoder layers, name them encoder_hidden_states
306
+ # 3a. Remember to use key_padding_mask to mask out padding tokens
307
+ # YOUR CODE STARTS HERE
308
+ encoder_hidden_states = self.encoder_embeddings(input_ids)
309
+ encoder_hidden_states = self._add_positions(encoder_hidden_states)
310
+ for l in self.encoder_layers:
311
+ encoder_hidden_states = l(encoder_hidden_states, key_padding_mask = key_padding_mask)
312
+
313
+ # YOUR CODE ENDS HERE
314
+
315
+ return encoder_hidden_states
316
+
317
+ def _decode(self, encoder_hidden_states, decoder_input_ids, key_padding_mask):
318
+ # TASK 2.6 (2 points)
319
+ # 1. Get decoder embeddings using self.decoder_embeddings
320
+ # 2. Add positional embedding to target embeddings using _add_positions
321
+ # 3. Use decoder embeddings and encoder_hidden_states as the decoder input
322
+ # (please use keyword arguments instead of positional arguments to minimize a chance of a bug)
323
+ # 3a. Remember to use key_padding_mask to mask out padding tokens for the encoder inputs
324
+ # 4. use self.out_proj to get the output logits, i.e. unnormalized scores over the target vocabulary
325
+ # YOUR CODE STARTS HERE
326
+ decoder_embedding = self.decoder_embeddings(decoder_input_ids)
327
+ decoder_embedding = self._add_positions(decoder_embedding)
328
+ # print("decoder_Embedding", decoder_embedding.shape)
329
+ for l in self.decoder_layers:
330
+ decoder_embedding = l(decoder_hidden_states = decoder_embedding, encoder_hidden_states=encoder_hidden_states, key_padding_mask = key_padding_mask)
331
+
332
+ logits = self.out_proj(decoder_embedding)
333
+ ## YOUR CODE ENDS HERE
334
+ return logits
335
+
336
+ ##############################################################################
337
+ # Don't worry about any of the code below this line, but feel free to take a look
338
+ # if you are interested in generation or model saving/loading.
339
+ ##############################################################################
340
+ @torch.inference_mode()
341
+ def generate(
342
+ self,
343
+ input_ids,
344
+ *,
345
+ bos_token_id,
346
+ eos_token_id,
347
+ pad_token_id=None,
348
+ key_padding_mask=None,
349
+ max_length=50,
350
+ beam_size=5,
351
+ kind="beam_search",
352
+ ):
353
+ """
354
+ Generate a translation given an input sequence.
355
+
356
+ Args:
357
+ input_ids (LongTensor): Encoder input sequence of size (batch_size, seq_len)
358
+ bos_token_id (int): Beginning of sentence token id
359
+ eos_token_id (int): End of sentence token id
360
+ pad_token_id (int): Padding token id, required if doing beam search
361
+ key_padding_mask (ByteTensor): Mask of size (batch_size, seq_len) where 1 means that the token is padding
362
+ max_length (int): Maximum length of the generated sequence
363
+ beam_size (int): Beam size for beam search
364
+ kind (str): Can be either "greedy" or "beam_search"
365
+
366
+ Return:
367
+ decoded_ids (LongTensor): Decoder output sequence of size (batch_size, seq_len)
368
+ """
369
+ if kind not in ["greedy", "beam_search"]:
370
+ raise ValueError("Unknown kind of generation: {}".format(kind))
371
+ if kind == "beam_search" and pad_token_id is None:
372
+ raise ValueError("Beam search requires a pad_token_id to be provided")
373
+
374
+ if kind == "greedy":
375
+ return self._generate_greedy(
376
+ input_ids=input_ids,
377
+ bos_token_id=bos_token_id,
378
+ eos_token_id=eos_token_id,
379
+ key_padding_mask=key_padding_mask,
380
+ max_length=max_length,
381
+ )
382
+
383
+ # beam search only supports batch size 1
384
+ beam_search_generations = []
385
+ for i in range(input_ids.size(0)):
386
+ _input_ids = input_ids[i].unsqueeze(0)
387
+ _key_padding_mask = key_padding_mask[i].unsqueeze(0) if key_padding_mask is not None else None
388
+
389
+ generated = self._generate_beam_search(
390
+ input_ids=_input_ids,
391
+ bos_token_id=bos_token_id,
392
+ eos_token_id=eos_token_id,
393
+ key_padding_mask=_key_padding_mask,
394
+ max_length=max_length,
395
+ beam_size=beam_size,
396
+ )
397
+
398
+ beam_search_generations.append(generated[0].detach().cpu().tolist())
399
+
400
+ return pad(beam_search_generations, pad_id=eos_token_id)
401
+
402
+ @torch.inference_mode()
403
+ def _generate_greedy(
404
+ self,
405
+ input_ids,
406
+ *,
407
+ bos_token_id,
408
+ eos_token_id,
409
+ key_padding_mask=None,
410
+ max_length=50,
411
+ ):
412
+ """
413
+ Greedy generation of translation. Selects most likely word on every step.
414
+
415
+ Args:
416
+ input_ids (LongTensor): Encoder input sequence of size (batch_size, seq_len)
417
+ max_length (int): Maximum length of the generated sequence
418
+ bos_token_id (int): Beginning of sentence token id
419
+ eos_token_id (int): End of sequence token id
420
+
421
+ Return:
422
+ translation (LongTensor): Decoder output sequence of size (batch_size, out_seq_len)
423
+ where out_seq_len <= max_length
424
+ """
425
+ encoder_hidden_states = self._encode(input_ids, key_padding_mask)
426
+
427
+ decoder_input_ids = torch.full((input_ids.shape[0], 1), bos_token_id, dtype=torch.long, device=input_ids.device)
428
+ translation = torch.zeros((input_ids.shape[0], 0), dtype=torch.long, device=input_ids.device)
429
+
430
+ eos_flags = torch.zeros((input_ids.shape[0],), dtype=torch.uint8, device=input_ids.device)
431
+
432
+ for _ in range(max_length):
433
+ logits = self._decode(encoder_hidden_states, decoder_input_ids, key_padding_mask)
434
+ logits = logits[:, -1, :]
435
+
436
+ next_token_id = torch.argmax(logits, dim=-1)
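+ # greedy decoding: take the highest-scoring token for every sequence in the batch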
437
+
438
+ decoder_input_ids = torch.cat((decoder_input_ids, next_token_id.unsqueeze(1)), dim=1)
439
+ translation = torch.cat((translation, next_token_id.unsqueeze(1)), dim=1)
440
+
441
+ eos_flags |= (next_token_id == eos_token_id)
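+ # track which sequences have already produced EOS; generation stops once all of them have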
442
+
443
+ if eos_flags.all():
444
+ break
445
+
446
+ return translation
447
+
448
+ @torch.inference_mode()
449
+ def _generate_beam_search(
450
+ self,
451
+ input_ids,
452
+ *,
453
+ bos_token_id,
454
+ eos_token_id,
455
+ key_padding_mask=None,
456
+ beam_size=5,
457
+ max_length=50,
458
+ ):
459
+ """
460
+ Beam search generation of translation.
461
+ Heavily inspired by https://github.com/pcyin/pytorch_basic_nmt
462
+
463
+ Args:
464
+ input_ids (LongTensor): Encoder input sequence of size (batch_size, seq_len)
465
+ max_length (int): Maximum length of the generated sequence
466
+ bos_token_id (int): Beginning of sentence token id
467
+ eos_token_id (int): End of sequence token id
468
+
469
+ Return:
470
+ translation (LongTensor): Decoder output sequence of size (batch_size, out_seq_len)
471
+ where out_seq_len <= max_length
472
+ """
473
+ assert len(input_ids) == 1, "Beam search is only supported for a single input sequence"
474
+ encoder_hidden_states = self._encode(input_ids, key_padding_mask)
475
+ device = input_ids.device
476
+
477
+ hypotheses = [[bos_token_id]]
478
+ hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=device)
479
+ completed_hypotheses = []
480
+
481
+ for _ in range(max_length):
482
+ if len(completed_hypotheses) >= beam_size:
483
+ break
484
+
485
+ hyp_num = len(hypotheses)
486
+ expanded_encoder_hidden_states = encoder_hidden_states.expand(
487
+ hyp_num,
488
+ encoder_hidden_states.size(1),
489
+ encoder_hidden_states.size(2),
490
+ )
491
+
492
+ # [batch_size*hyp_num=1*hyp_num, seq_len, hidden]
493
+ hypotheses_tensor = torch.tensor(hypotheses, dtype=torch.int64, device=device)
494
+ logits = self._decode(expanded_encoder_hidden_states, hypotheses_tensor, key_padding_mask)
495
+ logits = logits[:, -1, :] # [vocab_size]
496
+
497
+ log_p_t = F.log_softmax(logits, dim=-1)
498
+ live_hyp_num = beam_size - len(completed_hypotheses)
499
+
500
+ # [hyp_num] -> [1, hyp_num] -> [hyp_num, vocab_size] -> [hyp_num * vocab_size]
501
+ new_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
502
+ # [live_hyp_num], [live_hyp_num]
503
+ # for indices, the values range from 0 to hyp_num * vocab_size
504
+ top_new_hyp_scores, top_new_hyp_pos = torch.topk(new_hyp_scores, k=live_hyp_num)
505
+
506
+ # hypotheses ids in hyp_scores tensor [hyp_num,]
507
+ prev_hyp_ids = torch.div(top_new_hyp_pos, self.tgt_vocab_size, rounding_mode='floor')
508
+
509
+ # ids of the next words for each hypothesis
510
+ token_ids = top_new_hyp_pos % self.tgt_vocab_size
511
+
512
+ new_hypotheses = []
513
+ new_hyp_scores = []
514
+
515
+ # iterate live_hyp_num times
516
+ for prev_hyp_id, hyp_token_id, cand_new_hyp_score in zip(prev_hyp_ids, token_ids, top_new_hyp_scores):
517
+ prev_hyp_id = prev_hyp_id.item()
518
+ hyp_token_id = hyp_token_id.item()
519
+ cand_new_hyp_score = cand_new_hyp_score.item()
520
+
521
+ new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_token_id]
522
+ if hyp_token_id == eos_token_id:
523
+ completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score))
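+ # drop the leading BOS and the trailing EOS before storing the finished hypothesis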
524
+ else:
525
+ new_hypotheses.append(new_hyp_sent)
526
+ new_hyp_scores.append(cand_new_hyp_score)
527
+
528
+ if len(completed_hypotheses) == beam_size:
529
+ break
530
+
531
+ hypotheses = new_hypotheses
532
+ hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=device)
533
+
534
+ if len(completed_hypotheses) == 0:
535
+ completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item()))
536
+
537
+ completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
538
+ return torch.LongTensor(completed_hypotheses[0].value).unsqueeze(0)
539
+
540
+ def save_pretrained(self, save_path):
541
+ """Save the model weights to a directory
542
+
543
+ Args:
544
+ save_path: directory to save the model
545
+ """
546
+ config = {
547
+ "num_layers": self.num_layers,
548
+ "hidden": self.hidden,
549
+ "num_heads": self.num_heads,
550
+ "fcn_hidden": self.fcn_hidden,
551
+ "src_vocab_size": self.src_vocab_size,
552
+ "tgt_vocab_size": self.tgt_vocab_size,
553
+ "max_seq_len": self.max_seq_len,
554
+ "dropout": self.dropout_rate,
555
+ }
556
+
557
+ with open(os.path.join(save_path, "model_config.json"), "w") as f:
558
+ json.dump(config, f)
559
+
560
+ state_dict = self.state_dict()
561
+ torch.save(state_dict, os.path.join(save_path, "model.pt"))
562
+
563
+ @classmethod
564
+ def from_pretrained(cls, save_path, map_location=None):
565
+ """Load the model weights from a directory
566
+
567
+ Args:
568
+ save_path: directory to load the model
569
+ """
570
+ if map_location is None and not torch.cuda.is_available():
571
+ map_location = "cpu"
572
+
573
+ with open(os.path.join(save_path, "model_config.json"), "r") as f:
574
+ config = json.load(f)
575
+
576
+ model = cls(**config)
577
+ state_dict = torch.load(os.path.join(save_path, "model.pt"), map_location=map_location)
578
+ model.load_state_dict(state_dict)
579
+ return model
transformer_mt/utils.py ADDED
@@ -0,0 +1,42 @@
1
+ from copy import deepcopy
2
+ import random
3
+ import torch
4
+
5
+
6
+ def postprocess_text(preds, labels):
7
+ """Use this function to postprocess generations and labels before BLEU computation."""
8
+ preds = [pred.strip() for pred in preds]
9
+ labels = [[label.strip()] for label in labels]
10
+
11
+ return preds, labels
12
+
13
+
14
+ def pad(sequence_list, pad_id):
15
+ """Pads sequence_list to the longest sequence in the batch with pad_id.
16
+
17
+ Args:
18
+ sequence_list: a list of size batch_size of numpy arrays of different length
19
+ pad_id: int, a pad token id
20
+
21
+ Returns:
22
+ torch.LongTensor of shape [batch_size, max_sequence_len]
23
+ """
24
+ max_len = max(len(x) for x in sequence_list)
25
+ padded_sequence_list = []
26
+ for sequence in sequence_list:
27
+ padding = [pad_id] * (max_len - len(sequence))
28
+ padded_sequence = sequence + padding
29
+ padded_sequence_list.append(padded_sequence)
30
+
31
+ return torch.LongTensor(padded_sequence_list)
32
+
33
+
34
+ def sample_small_debug_dataset(raw_datasets):
35
+ random_indices = random.sample(list(range(len(raw_datasets["train"]))), 100)
36
+ subset = raw_datasets["train"].select(random_indices)
37
+ raw_datasets["train"] = deepcopy(subset)
38
+ if "validation" in raw_datasets:
39
+ raw_datasets["validation"] = deepcopy(subset)
40
+ if "test" in raw_datasets:
41
+ raw_datasets["test"] = deepcopy(subset)
42
+ return raw_datasets
transformer_mt_roberta/__init__.py ADDED
File without changes
transformer_mt_roberta/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (168 Bytes). View file
 
transformer_mt_roberta/__pycache__/modeling_attention.cpython-37.pyc ADDED
Binary file (2.96 kB). View file
 
transformer_mt_roberta/__pycache__/modeling_transformer.cpython-37.pyc ADDED
Binary file (11.4 kB). View file
 
transformer_mt_roberta/__pycache__/modeling_transformer_final.cpython-37.pyc ADDED
Binary file (8.15 kB). View file
 
transformer_mt_roberta/__pycache__/utils.cpython-37.pyc ADDED
Binary file (1.79 kB). View file
 
transformer_mt_roberta/modeling_attention.py ADDED
@@ -0,0 +1,126 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2022 Vladislav Lialin and Namrata Shivagunde
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+
21
+ class MultiHeadAttention(nn.Module):
22
+ def __init__(self, input_size, hidden, num_heads, causal=False):
23
+ """Multi-head attention module which computes [softmax(xQ_h @ xK_h^T) @ xV: ...] @ U
24
+
25
+ Can work as both self-attention or cross-attention (if kv is provided to .forward).
26
+
27
+ Args:
28
+ causal: use causal masking (do not allow target to look to the future or current token of source)
29
+ """
30
+ if hidden % num_heads:
31
+ raise ValueError(f"hidden should be divisible by num_heads, "
32
+ f"but got hidden={hidden} and num_heads={num_heads}")
33
+ super().__init__()
34
+
35
+ self.k = nn.Linear(input_size, hidden)
36
+ self.q = nn.Linear(input_size, hidden)
37
+ self.v = nn.Linear(input_size, hidden)
38
+ self.mix = nn.Linear(hidden, hidden)
39
+
40
+ self.num_heads = num_heads
41
+ self.head_size = hidden // num_heads
42
+ self.scale = self.head_size ** 0.5
43
+ self.causal = causal # causal masking
44
+
45
+ def forward(self, q, kv=None, key_padding_mask=None, return_attention=False):
46
+ """[Softmax(source Q_1 @ target K_1^T) @ target V_1 : ... ) @ x V_heads] @ U
47
+
48
+ Performs self-attention if kv is not specified.
49
+ In this case, kv = q and kv_seq_len = query_seq_len.
50
+
51
+ Args:
52
+ q: FloatTensor[batch_size, query_seq_len, input_size]
53
+ kv (target) : optional, FloatTensor[batch_size, kv_seq_len, input_size]
54
+ key_padding_mask: BoolTensor[batch_size, kv_seq_len] 0 means unpadded, 1 means padded
55
+
56
+ Returns:
57
+ FloatTensor[batch_size, seq_len, hidden]
58
+ """
59
+
60
+ # Task 1.1 (1 point)
61
+ # Update this function with cross-attention mechanism
62
+ # If target is None, then target (kv) and source (q) will be same.
63
+ # Define k, q, v using self.k, self.q and self.v based on if the target exists or not
64
+ # Note : Please write shape of each tensor for each line of code
65
+ ## YOUR CODE STARTS HERE## ~ 2 lines code
66
+ # compute projections; fall back to self-attention when kv is not given
+ kv = kv if kv is not None else q
+ k = self.k(kv)  # [batch_size, kv_seq_len, hidden]
+ v = self.v(kv)  # [batch_size, kv_seq_len, hidden]
+ q = self.q(q)  # [batch_size, query_seq_len, hidden]
71
+
72
+ # YOUR CODE ENDS HERE
73
+
74
+ bs, attending_seq, _ = q.shape
75
+ attended_seq = k.shape[1]
76
+
77
+ # [b, s, h] -> [b, h, s] -> [b * heads, h / heads, s] -> [b * heads, s, h / heads]
78
+ k = k.transpose(1, 2).reshape(bs * self.num_heads, self.head_size, -1).transpose(1, 2).contiguous() # [batch * num_heads, seq, hidden / num_heads]
79
+ q = q.transpose(1, 2).reshape(bs * self.num_heads, self.head_size, -1).transpose(1, 2).contiguous()
80
+ v = v.transpose(1, 2).reshape(bs * self.num_heads, self.head_size, -1).transpose(1, 2).contiguous()
81
+
82
+ scores = q @ k.transpose(1, 2) / self.scale # [batch * num_heads, attending_seq, attended_seq]
83
+ assert scores.shape == (bs * self.num_heads, attending_seq, attended_seq)
84
+
85
+
86
+ if key_padding_mask is not None:
87
+ # Task 1.2 (1 point)
88
+ # Padding
89
+ # Set the scores corresponding to padded positions (key_padding_mask == 1) to -inf
90
+ #
91
+ # You might need to reshape the scores to [batch_size, seq_len, seq_len]
92
+ # in this case, remember to reshape them back
93
+ # Our implementation is 3 lines
94
+ # YOUR CODE STARTS HERE
95
+ # print(scores.shape, key_padding_mask.unsqueeze(-2).shape)
96
+
97
+
98
+ # view scores as [batch, heads, attending_seq, attended_seq] so the padding mask broadcasts per example
+ scores = scores.reshape(bs, self.num_heads, attending_seq, attended_seq)
+ scores = scores.masked_fill(key_padding_mask[:, None, None, :] == 1, value=float("-inf"))
+ scores = scores.view(bs * self.num_heads, attending_seq, attended_seq)
104
+
105
+
106
+ # YOUR CODE ENDS HERE
107
+
108
+ assert scores.size() == (bs * self.num_heads, attending_seq, attended_seq),\
109
+ f"scores have wrong shape. Expected {(bs * self.num_heads, attending_seq, attended_seq)}, got {scores.size()}"
110
+
111
+ if self.causal:
112
+ causal_mask = torch.triu(torch.ones(attending_seq, attended_seq, dtype=torch.bool, device=scores.device), diagonal=1)
113
+ scores.masked_fill_(causal_mask.bool().unsqueeze(0), float("-inf"))
114
+
115
+ probs = torch.softmax(scores, dim=-1) # [batch * num_heads, tgt_seq, src_seq]
116
+ att = probs @ v # [batch * num_heads, tgt_seq, hidden / num_heads]
117
+
118
+ # [b * heads, s, h / heads] -> [b * heads, h / heads, s] -> [b, h, s] -> [b, s, h]
119
+ att = att.transpose(1, 2).reshape(bs, -1, attending_seq).transpose(1, 2).contiguous()
120
+
121
+ att = self.mix(att)
122
+
123
+ if return_attention:
124
+ return att, probs
125
+
126
+ return att
transformer_mt_roberta/modeling_transformer_final.py ADDED
@@ -0,0 +1,353 @@
1
+ import os
2
+ import json
3
+ from collections import namedtuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from transformer_mt.modeling_attention import MultiHeadAttention
10
+ from transformer_mt.utils import pad
11
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
12
+
13
+ Hypothesis = namedtuple("Hypothesis", ["value", "score"])
14
+
15
+ class TransformerDecoderLayer(nn.Module):
16
+ def __init__(self, hidden, num_heads, fcn_hidden, dropout=0.0):
17
+ super().__init__()
18
+
19
+
20
+ self.self_attention = MultiHeadAttention(
21
+ input_size=hidden,
22
+ hidden=hidden,
23
+ num_heads=num_heads,
24
+ causal=True,
25
+ )
26
+
27
+ self.cross_attention = MultiHeadAttention(
28
+ input_size=hidden,
29
+ hidden=hidden,
30
+ num_heads=num_heads,
31
+ causal=False,
32
+ )
33
+
34
+ self.self_att_layer_norm = nn.LayerNorm(hidden)
35
+ self.cross_att_layer_norm = nn.LayerNorm(hidden)
36
+
37
+ self.fcn = nn.Sequential(
38
+ nn.Linear(hidden, fcn_hidden),
39
+ nn.ReLU(),
40
+ nn.Linear(fcn_hidden, hidden),
41
+ )
42
+ self.fcn_layer_norm = nn.LayerNorm(hidden)
43
+ self.dropout = nn.Dropout(dropout)
44
+
45
+ # YOUR CODE ENDS HERE
46
+
47
+ def forward(self, decoder_hidden_states, encoder_hidden_states, key_padding_mask=None):
48
+
49
+ residual_1 = decoder_hidden_states
50
+ out = self.self_attention(decoder_hidden_states, key_padding_mask=None)
51
+ out = self.self_att_layer_norm(residual_1 + out)
52
+ residual_2 = out
53
+ out = self.cross_attention(q = out, kv = encoder_hidden_states, key_padding_mask = key_padding_mask)
54
+
55
+ out = self.cross_att_layer_norm(out+residual_2)
56
+ out = self.fcn(out)
57
+ out = self.dropout(out)
58
+ residual_3 = out
59
+ out = self.fcn_layer_norm(out+residual_3)
60
+
61
+ return out
62
+
63
+
64
+ class TransfomerEncoderDecoderModel(nn.Module):
65
+ def __init__(
66
+ self,
67
+ *,
68
+ num_layers,
69
+ hidden,
70
+ num_heads,
71
+ fcn_hidden,
72
+ max_seq_len,
73
+ src_vocab_size,
74
+ tgt_vocab_size,
75
+ dropout=0.1,
76
+ ):
77
+ super().__init__()
78
+ self.src_vocab_size = src_vocab_size
79
+ self.tgt_vocab_size = tgt_vocab_size
80
+ self.num_layers = num_layers
81
+ self.hidden = hidden
82
+ self.num_heads = num_heads
83
+ self.fcn_hidden = fcn_hidden
84
+ self.dropout_rate = dropout
85
+ self.max_seq_len = max_seq_len
86
+
87
+ self.decoder_embeddings = nn.Embedding(self.tgt_vocab_size, self.hidden)
88
+ self.positional_emb = nn.Embedding(self.max_seq_len, self.hidden)
89
+
90
+ self.out_proj = nn.Linear(self.hidden, self.tgt_vocab_size)
91
+
92
+ self.dropout = nn.Dropout(self.dropout_rate)
93
+
94
+ self.encoder = AutoModelForMaskedLM.from_pretrained("flax-community/roberta-base-danish", output_hidden_states=True)
95
+
96
+ self.decoder_layers = nn.ModuleList([TransformerDecoderLayer(hidden = self.hidden,
97
+ num_heads = self.num_heads,
98
+ fcn_hidden = self.fcn_hidden,
99
+ dropout=self.dropout_rate
100
+ )
101
+ for _ in range(self.num_layers)
102
+ ])
103
+
104
+ # YOUR CODE ENDS HERE
105
+
106
+ def _add_positions(self, sequence_tensor):
107
+
108
+ seq_len = sequence_tensor.shape[1]
109
+ positions = torch.arange(seq_len, device=sequence_tensor.device)
110
+ positional_emb = self.positional_emb(positions)
111
+ output = sequence_tensor + positional_emb
112
+ return output
113
+
114
+ def forward(
115
+ self,
116
+ input_ids=None,
117
+ encoder_hidden_states=None,
118
+ decoder_input_ids=None,
119
+ key_padding_mask=None,
120
+ ):
121
+
122
+ if input_ids is None and encoder_hidden_states is None:
123
+ raise ValueError("You should provide either input_ids or encoder_hidden_states")
124
+
125
+ if encoder_hidden_states is None:
126
+ encoder_hidden_states = self.encoder(input_ids, output_hidden_states=True)
127
+ encoder_hidden_states = encoder_hidden_states.hidden_states[-1]
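+ # the last hidden layer of the pretrained Danish RoBERTa serves as the encoder output for cross-attention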
128
+ # print( encoder_hidden_states.shape)
129
+
130
+ logits = self._decode(encoder_hidden_states, decoder_input_ids, key_padding_mask)
131
+ # print(logits.shape)
132
+
133
+
134
+ return logits
135
+
136
+ def _decode(self, encoder_hidden_states, decoder_input_ids, key_padding_mask):
137
+
138
+ decoder_embedding = self.decoder_embeddings(decoder_input_ids)
139
+ decoder_embedding = self._add_positions(decoder_embedding)
140
+
141
+ for l in self.decoder_layers:
142
+ decoder_embedding = l(decoder_hidden_states = decoder_embedding, encoder_hidden_states=encoder_hidden_states, key_padding_mask = key_padding_mask)
143
+
144
+ logits = self.out_proj(decoder_embedding)
145
+ ## YOUR CODE ENDS HERE
146
+ return logits
147
+
148
+
149
+ @torch.inference_mode()
150
+ def generate(
151
+ self,
152
+ input_ids,
153
+ *,
154
+ bos_token_id,
155
+ eos_token_id,
156
+ pad_token_id=None,
157
+ key_padding_mask=None,
158
+ max_length=50,
159
+ beam_size=5,
160
+ kind="beam_search",
161
+ ):
162
+
163
+ if kind not in ["greedy", "beam_search"]:
164
+ raise ValueError("Unknown kind of generation: {}".format(kind))
165
+ if kind == "beam_search" and pad_token_id is None:
166
+ raise ValueError("Beam search requires a pad_token_id to be provided")
167
+
168
+ if kind == "greedy":
169
+ return self._generate_greedy(
170
+ input_ids=input_ids,
171
+ bos_token_id=bos_token_id,
172
+ eos_token_id=eos_token_id,
173
+ key_padding_mask=key_padding_mask,
174
+ max_length=max_length,
175
+ )
176
+
177
+ # beam search only supports batch size 1
178
+ beam_search_generations = []
179
+ for i in range(input_ids.size(0)):
180
+ _input_ids = input_ids[i].unsqueeze(0)
181
+ _key_padding_mask = key_padding_mask[i].unsqueeze(0) if key_padding_mask is not None else None
182
+
183
+ generated = self._generate_beam_search(
184
+ input_ids=_input_ids,
185
+ bos_token_id=bos_token_id,
186
+ eos_token_id=eos_token_id,
187
+ key_padding_mask=_key_padding_mask,
188
+ max_length=max_length,
189
+ beam_size=beam_size,
190
+ )
191
+
192
+ beam_search_generations.append(generated[0].detach().cpu().tolist())
193
+
194
+ return pad(beam_search_generations, pad_id=eos_token_id)
195
+
196
+ @torch.inference_mode()
197
+ def _generate_greedy(
198
+ self,
199
+ input_ids,
200
+ *,
201
+ bos_token_id,
202
+ eos_token_id,
203
+ key_padding_mask=None,
204
+ max_length=50,
205
+ ):
206
+
207
+ # encoder_hidden_states = self._encode(input_ids, key_padding_mask)
208
+ encoder_hidden_states = self.encoder(input_ids, output_hidden_states=True, attention_mask=key_padding_mask)
209
+ encoder_hidden_states = encoder_hidden_states.hidden_states[-1]
210
+
211
+
212
+ decoder_input_ids = torch.full((input_ids.shape[0], 1), bos_token_id, dtype=torch.long, device=input_ids.device)
213
+ translation = torch.zeros((input_ids.shape[0], 0), dtype=torch.long, device=input_ids.device)
214
+
215
+ eos_flags = torch.zeros((input_ids.shape[0],), dtype=torch.uint8, device=input_ids.device)
216
+
217
+ for _ in range(max_length):
218
+ logits = self._decode(encoder_hidden_states, decoder_input_ids, key_padding_mask)
219
+ logits = logits[:, -1, :]
220
+
221
+ next_token_id = torch.argmax(logits, dim=-1)
222
+
223
+ decoder_input_ids = torch.cat((decoder_input_ids, next_token_id.unsqueeze(1)), dim=1)
224
+ translation = torch.cat((translation, next_token_id.unsqueeze(1)), dim=1)
225
+
226
+ eos_flags |= (next_token_id == eos_token_id)
227
+
228
+ if eos_flags.all():
229
+ break
230
+
231
+ return translation
232
+
233
+ @torch.inference_mode()
234
+ def _generate_beam_search(
235
+ self,
236
+ input_ids,
237
+ *,
238
+ bos_token_id,
239
+ eos_token_id,
240
+ key_padding_mask=None,
241
+ beam_size=5,
242
+ max_length=50,
243
+ ):
244
+
245
+ assert len(input_ids) == 1, "Beam search is only supported for a single input sequence"
246
+ #encoder_hidden_states = self._encode(input_ids, key_padding_mask)
247
+ encoder_hidden_states = self.encoder(input_ids, output_hidden_states=True, attention_mask=key_padding_mask)
248
+ encoder_hidden_states = encoder_hidden_states.hidden_states[-1]
249
+ device = input_ids.device
250
+
251
+ hypotheses = [[bos_token_id]]
252
+ hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=device)
253
+ completed_hypotheses = []
254
+
255
+ for _ in range(max_length):
256
+ if len(completed_hypotheses) >= beam_size:
257
+ break
258
+
259
+ hyp_num = len(hypotheses)
260
+ expanded_encoder_hidden_states = encoder_hidden_states.expand(
261
+ hyp_num,
262
+ encoder_hidden_states.size(1),
263
+ encoder_hidden_states.size(2),
264
+ )
265
+
266
+ # [batch_size*hyp_num=1*hyp_num, seq_len, hidden]
267
+ hypotheses_tensor = torch.tensor(hypotheses, dtype=torch.int64, device=device)
268
+ logits = self._decode(expanded_encoder_hidden_states, hypotheses_tensor, key_padding_mask)
269
+ logits = logits[:, -1, :] # [vocab_size]
270
+
271
+ log_p_t = F.log_softmax(logits, dim=-1)
272
+ live_hyp_num = beam_size - len(completed_hypotheses)
273
+
274
+ # [hyp_num] -> [1, hyp_num] -> [hyp_num, vocab_size] -> [hyp_num * vocab_size]
275
+ new_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
276
+ # [live_hyp_num], [live_hyp_num]
277
+ # for indices, the values range from 0 to hyp_num * vocab_size
278
+ top_new_hyp_scores, top_new_hyp_pos = torch.topk(new_hyp_scores, k=live_hyp_num)
279
+
280
+ # hypotheses ids in hyp_scores tensor [hyp_num,]
281
+ prev_hyp_ids = torch.div(top_new_hyp_pos, self.tgt_vocab_size, rounding_mode='floor')
282
+
283
+ # ids of the next words for each hypothesis
284
+ token_ids = top_new_hyp_pos % self.tgt_vocab_size
285
+
286
+ new_hypotheses = []
287
+ new_hyp_scores = []
288
+
289
+ # iterate live_hyp_num times
290
+ for prev_hyp_id, hyp_token_id, cand_new_hyp_score in zip(prev_hyp_ids, token_ids, top_new_hyp_scores):
291
+ prev_hyp_id = prev_hyp_id.item()
292
+ hyp_token_id = hyp_token_id.item()
293
+ cand_new_hyp_score = cand_new_hyp_score.item()
294
+
295
+ new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_token_id]
296
+ if hyp_token_id == eos_token_id:
297
+ completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score))
298
+ else:
299
+ new_hypotheses.append(new_hyp_sent)
300
+ new_hyp_scores.append(cand_new_hyp_score)
301
+
302
+ if len(completed_hypotheses) == beam_size:
303
+ break
304
+
305
+ hypotheses = new_hypotheses
306
+ hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=device)
307
+
308
+ if len(completed_hypotheses) == 0:
309
+ completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item()))
310
+
311
+ completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
312
+ return torch.LongTensor(completed_hypotheses[0].value).unsqueeze(0)
313
+
314
+ def save_pretrained(self, save_path):
315
+ """Save the model weights to a directory
316
+
317
+ Args:
318
+ save_path: directory to save the model
319
+ """
320
+ config = {
321
+ "num_layers": self.num_layers,
322
+ "hidden": self.hidden,
323
+ "num_heads": self.num_heads,
324
+ "fcn_hidden": self.fcn_hidden,
325
+ "src_vocab_size": self.src_vocab_size,
326
+ "tgt_vocab_size": self.tgt_vocab_size,
327
+ "max_seq_len": self.max_seq_len,
328
+ "dropout": self.dropout_rate,
329
+ }
330
+
331
+ with open(os.path.join(save_path, "model_config.json"), "w") as f:
332
+ json.dump(config, f)
333
+
334
+ state_dict = self.state_dict()
335
+ torch.save(state_dict, os.path.join(save_path, "model.pt"))
336
+
337
+ @classmethod
338
+ def from_pretrained(cls, save_path, map_location=None):
339
+ """Load the model weights from a directory
340
+
341
+ Args:
342
+ save_path: directory to load the model
343
+ """
344
+ if map_location is None and not torch.cuda.is_available():
345
+ map_location = "cpu"
346
+
347
+ with open(os.path.join(save_path, "model_config.json"), "r") as f:
348
+ config = json.load(f)
349
+
350
+ model = cls(**config)
351
+ state_dict = torch.load(os.path.join(save_path, "model.pt"), map_location=map_location)
352
+ model.load_state_dict(state_dict)
353
+ return model
transformer_mt_roberta/utils.py ADDED
@@ -0,0 +1,42 @@
1
+ from copy import deepcopy
2
+ import random
3
+ import torch
4
+
5
+
6
+ def postprocess_text(preds, labels):
7
+ """Use this function to postprocess generations and labels before BLEU computation."""
8
+ preds = [pred.strip() for pred in preds]
9
+ labels = [[label.strip()] for label in labels]
10
+
11
+ return preds, labels
12
+
13
+
14
+ def pad(sequence_list, pad_id):
15
+ """Pads sequence_list to the longest sequence in the batch with pad_id.
16
+
17
+ Args:
18
+ sequence_list: a list of size batch_size of numpy arrays of different length
19
+ pad_id: int, a pad token id
20
+
21
+ Returns:
22
+ torch.LongTensor of shape [batch_size, max_sequence_len]
23
+ """
24
+ max_len = max(len(x) for x in sequence_list)
25
+ padded_sequence_list = []
26
+ for sequence in sequence_list:
27
+ padding = [pad_id] * (max_len - len(sequence))
28
+ padded_sequence = sequence + padding
29
+ padded_sequence_list.append(padded_sequence)
30
+
31
+ return torch.LongTensor(padded_sequence_list)
32
+
33
+
34
+ def sample_small_debug_dataset(raw_datasets):
35
+ random_indices = random.sample(list(range(len(raw_datasets["train"]))), 100)
36
+ subset = raw_datasets["train"].select(random_indices)
37
+ raw_datasets["train"] = deepcopy(subset)
38
+ if "validation" in raw_datasets:
39
+ raw_datasets["validation"] = deepcopy(subset)
40
+ if "test" in raw_datasets:
41
+ raw_datasets["test"] = deepcopy(subset)
42
+ return raw_datasets