initial commit
- app.py +75 -0
- attention.py +53 -0
- models.py +162 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,75 @@
import random
from typing import *

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sentencepiece as sp
import torch

from huggingface_hub import hf_hub_download
from torchtext.datasets import Multi30k

from models import Seq2Seq


# Load model
model_path = hf_hub_download("msarmi9/multi30k", "models/de-en/model.bin")
model = Seq2Seq(vocab_size=8000, hidden_dim=512, bos_idx=1, eos_idx=2, pad_idx=3, temperature=2)
model.load_state_dict(torch.load(model_path))
model.eval()

# Load sentencepiece tokenizers
source_spm_path = hf_hub_download("msarmi9/multi30k", "models/de-en/de8000.model")
target_spm_path = hf_hub_download("msarmi9/multi30k", "models/de-en/en8000.model")
source_spm = sp.SentencePieceProcessor(model_file=source_spm_path, add_eos=True)
target_spm = sp.SentencePieceProcessor(model_file=target_spm_path, add_eos=True)

# Load test set for example inputs
normalize = lambda sample: (sample[0].lower().strip(), sample[1].lower().strip())
test_source, _ = zip(*map(normalize, Multi30k(split="test", language_pair=("de", "en"))))


def attention_heatmap(input_tokens: List[str], output_tokens: List[str], weights: np.ndarray) -> plt.Figure:
    """Plot attention weights as a heatmap with source tokens on the x-axis and output tokens on the y-axis."""
    figure = plt.figure(dpi=800, tight_layout=True)
    axes = sns.heatmap(weights, cmap="gray", cbar=False)
    axes.set_xticklabels(input_tokens, rotation=90)
    axes.set_yticklabels(output_tokens, rotation=0)
    axes.tick_params(axis="both", length=0)
    axes.xaxis.tick_top()
    plt.close()
    return figure


@torch.inference_mode()
def run(input: str) -> Tuple[str, plt.Figure]:
    """Run inference on a single sentence. Returns prediction and attention heatmap."""
    input_tensor = torch.tensor(source_spm.encode(input), dtype=torch.int64)
    output, weights = model.decode(input_tensor, max_decode_length=max(len(input_tensor), 80))
    output = target_spm.decode(output.detach().tolist())
    input_tokens = source_spm.encode(input, out_type=str)
    output_tokens = target_spm.encode(output, out_type=str)
    return output, attention_heatmap(input_tokens, output_tokens, weights.detach().numpy())


if __name__ == "__main__":
    interface = gr.Interface(
        run,
        inputs=gr.inputs.Textbox(lines=4, label="German"),
        outputs=[
            gr.outputs.Textbox(label="English"),
            gr.outputs.Image(type="plot", label="Attention Heatmap"),
        ],
        title="Multi30k Translation Widget",
        examples=random.sample(test_source, k=30),
        examples_per_page=10,
        allow_flagging="never",
        theme="huggingface",
        live=True,
    )

    interface.launch(
        enable_queue=True,
        cache_examples=True,
    )
attention.py
ADDED
@@ -0,0 +1,53 @@
from typing import *

import torch
import torch.nn as nn
import torch.nn.functional as F

Tensor = torch.Tensor


class Attention(nn.Module):
    """Container for applying an attention scoring function."""

    def __init__(self, score: nn.Module, dropout: nn.Module = None):
        super().__init__()
        self.score = score
        self.dropout = dropout

    def forward(self, decoder_state: Tensor, encoder_state: Tensor, source_mask: Tensor = None) -> Tuple[Tensor, Tensor]:
        """Return context and attention weights. Accepts a boolean mask indicating padding in the source sequence."""
        (B, L, D), (B, T, _) = decoder_state.shape, encoder_state.shape
        scores = self.score(decoder_state, encoder_state)           # (B, L, T)
        if source_mask is not None:                                 # (B, T)
            scores.masked_fill_(source_mask.view(B, 1, T), -1e4)
        weights = F.softmax(scores, dim=-1)                         # (B, L, T)
        if self.dropout is not None:
            weights = self.dropout(weights)
        context = weights @ encoder_state                           # (B, L, _)
        return context, weights                                     # (B, L, _), (B, L, T)


class ConcatScore(nn.Module):
    """A two-layer network used as an attention scoring function. Expects a bidirectional encoder."""

    def __init__(self, d: int):
        super().__init__()
        self.w = nn.Linear(3*d, d)
        self.v = nn.Linear(d, 1, bias=False)
        self.initialize_parameters()

    def forward(self, decoder_state: Tensor, encoder_state: Tensor) -> Tensor:
        """Return attention scores."""
        (B, L, D), (B, T, _) = decoder_state.shape, encoder_state.shape   # (B, L, D), (B, T, 2*D)
        decoder_state = decoder_state.repeat_interleave(T, dim=1)         # (B, L*T, D)
        encoder_state = encoder_state.repeat(1, L, 1)                     # (B, L*T, 2*D)
        concatenated = torch.cat((decoder_state, encoder_state), dim=-1)  # (B, L*T, 3*D)
        scores = self.v(torch.tanh(self.w(concatenated)))                 # (B, L*T, 1)
        return scores.view(B, L, T)                                       # (B, L, T)

    @torch.no_grad()
    def initialize_parameters(self):
        nn.init.xavier_uniform_(self.w.weight)
        nn.init.xavier_uniform_(self.v.weight, gain=nn.init.calculate_gain("tanh"))
        nn.init.zeros_(self.w.bias)
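A minimal sketch of how these two modules fit together, using made-up toy sizes (batch 2, decoder length 4, source length 6, hidden size 8) rather than the deployed model configuration:

import torch
import torch.nn as nn

from attention import Attention, ConcatScore

B, L, T, D = 2, 4, 6, 8                               # toy sizes for illustration only
attention = Attention(ConcatScore(D), nn.Dropout(p=0.1))
attention.eval()                                      # disable dropout on the attention weights

decoder_state = torch.randn(B, L, D)                  # decoder hidden states
encoder_state = torch.randn(B, T, 2 * D)              # bidirectional encoder output
source_mask = torch.zeros(B, T, dtype=torch.bool)     # True marks padding; none in this toy batch

with torch.no_grad():
    context, weights = attention(decoder_state, encoder_state, source_mask)

print(context.shape)   # torch.Size([2, 4, 16])
print(weights.shape)   # torch.Size([2, 4, 6])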
models.py
ADDED
@@ -0,0 +1,162 @@
import random
from collections import OrderedDict
from typing import *

import torch
import torch.nn as nn

from attention import Attention
from attention import ConcatScore

Tensor = torch.Tensor


class Encoder(nn.Module):
    """Single layer recurrent bidirectional encoder."""

    def __init__(self, vocab_size: int, hidden_dim: int, pad_idx: int):
        super().__init__()
        self.embedding = nn.Sequential(
            OrderedDict(
                embedding=nn.Embedding(vocab_size, hidden_dim, padding_idx=pad_idx),
                dropout=nn.Dropout(p=0.33),
            )
        )
        self.gru = nn.GRU(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2*hidden_dim, hidden_dim)
        self.initialize_parameters()

    def forward(self, input: Tensor) -> Tuple[Tensor, Tensor]:
        """Encode a sequence of tokens as a sequence of hidden states."""
        B, T = input.shape
        embedded = self.embedding(input)                     # (B, T, D)
        output, hidden = self.gru(embedded)                  # (B, T, 2*D), (2, B, D)
        hidden = torch.cat((hidden[0], hidden[1]), dim=-1)   # (B, 2*D)
        hidden = torch.tanh(self.fc(hidden))                 # (B, D)
        return output, hidden.unsqueeze(0)                   # (B, T, 2*D), (1, B, D)

    @torch.no_grad()
    def initialize_parameters(self):
        """Initialize linear weights uniformly, recurrent weights orthogonally, and biases to zero."""
        for name, parameters in self.named_parameters():
            if "embedding" in name:
                nn.init.xavier_uniform_(parameters)
            elif "weight_ih" in name:
                w_ir, w_iz, w_in = torch.chunk(parameters, chunks=3, dim=0)
                nn.init.xavier_uniform_(w_ir)
                nn.init.xavier_uniform_(w_iz)
                nn.init.xavier_uniform_(w_in)
            elif "weight_hh" in name:
                w_hr, w_hz, w_hn = torch.chunk(parameters, chunks=3, dim=0)
                nn.init.orthogonal_(w_hr)
                nn.init.orthogonal_(w_hz)
                nn.init.orthogonal_(w_hn)
            elif "weight" in name:
                nn.init.xavier_uniform_(parameters)
            elif "bias" in name:
                nn.init.zeros_(parameters)


class Decoder(nn.Module):
    """Single layer recurrent decoder."""

    def __init__(self, vocab_size: int, hidden_dim: int, pad_idx: int, temperature: float = 1.0):
        super().__init__()
        self.embedding = nn.Sequential(
            OrderedDict(
                embedding=nn.Embedding(vocab_size, hidden_dim, padding_idx=pad_idx),
                dropout=nn.Dropout(p=0.33),
            )
        )
        self.attention = Attention(ConcatScore(hidden_dim), nn.Dropout(p=0.1))
        self.gru = nn.GRU(3*hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Sequential(
            OrderedDict(
                fc1=nn.Linear(4*hidden_dim, hidden_dim),
                layer_norm=nn.LayerNorm(hidden_dim),
                gelu=nn.GELU(),
                fc2=nn.Linear(hidden_dim, vocab_size, bias=False),
            )
        )
        self.fc.fc2.weight = self.embedding.embedding.weight   # tie output projection to input embedding
        self.temperature = temperature
        self.initialize_parameters()

    def forward(self, input: Tensor, hidden: Tensor, encoder_output: Tensor, source_mask: Tensor = None) -> Tuple[Tensor, Tensor, Tensor]:
        """Predict the next token given an input token. Returns unnormalized predictions over the vocabulary."""
        B, = input.shape                                                                          # L=1
        embedded = self.embedding(input.view(B, 1))                                               # (B, 1, D)
        context, weights = self.attention(hidden.view(B, 1, -1), encoder_output, source_mask)     # (B, 1, 2*D), (B, 1, T)
        output, hidden = self.gru(torch.cat((embedded, context), dim=-1), hidden)                 # (B, 1, D), (1, B, D)
        predictions = self.fc(torch.cat((embedded, context, output), dim=-1)) / self.temperature  # (B, 1, V)
        return predictions.view(B, -1), hidden, weights.view(B, -1)                               # (B, V), (1, B, D), (B, T)

    @torch.no_grad()
    def initialize_parameters(self):
        """Initialize linear weights uniformly, recurrent weights orthogonally, and biases to zero."""
        for name, parameters in self.named_parameters():
            if "norm" in name:
                continue
            elif "embedding" in name:
                nn.init.xavier_uniform_(parameters)
            elif "weight_ih" in name:
                w_ir, w_iz, w_in = torch.chunk(parameters, chunks=3, dim=0)
                nn.init.xavier_uniform_(w_ir)
                nn.init.xavier_uniform_(w_iz)
                nn.init.xavier_uniform_(w_in)
            elif "weight_hh" in name:
                w_hr, w_hz, w_hn = torch.chunk(parameters, chunks=3, dim=0)
                nn.init.orthogonal_(w_hr)
                nn.init.orthogonal_(w_hz)
                nn.init.orthogonal_(w_hn)
            elif "weight" in name:
                nn.init.xavier_uniform_(parameters)
            elif "bias" in name:
                nn.init.zeros_(parameters)


class Seq2Seq(nn.Module):
    """Seq2seq with attention."""

    def __init__(self, vocab_size: int, hidden_dim: int, bos_idx: int, eos_idx: int, pad_idx: int, teacher_forcing: float = 0.5, temperature: float = 1.0):
        super().__init__()
        self.encoder = Encoder(vocab_size, hidden_dim, pad_idx)
        self.decoder = Decoder(vocab_size, hidden_dim, pad_idx, temperature=temperature)
        self.bos_idx = bos_idx
        self.eos_idx = eos_idx
        self.pad_idx = pad_idx
        self.teacher_forcing = teacher_forcing

    def forward(self, source: Tensor, target: Tensor) -> Tensor:
        """Forward pass at training time. Returns unnormalized predictions over the vocabulary."""
        (B, T), (B, L) = source.shape, target.shape
        encoder_output, hidden = self.encoder(source)                         # (B, T, 2*D), (1, B, D)
        decoder_input = torch.full((B,), self.bos_idx, device=source.device)  # (B,)
        source_mask = source == self.pad_idx                                  # (B, T)

        output = []
        for i in range(L):
            predictions, hidden, _ = self.decoder(decoder_input, hidden, encoder_output, source_mask)  # (B, V), (1, B, D)
            output.append(predictions)
            if self.training and random.random() < self.teacher_forcing:
                decoder_input = target[:, i]                                  # (B,)
            else:
                decoder_input = predictions.argmax(dim=1)                     # (B,)
        return torch.stack(output, dim=1)                                     # (B, L, V)

    @torch.inference_mode()
    def decode(self, source: Tensor, max_decode_length: int) -> Tuple[Tensor, Tensor]:
        """Decode a single sequence at inference time. Returns output sequence and attention weights."""
        B, (T,) = 1, source.shape
        encoder_output, hidden = self.encoder(source.view(B, T))              # (B, T, 2*D), (1, B, D)
        decoder_input = torch.full((B,), self.bos_idx, device=source.device)  # (B,)

        output, attention = [], []
        for i in range(max_decode_length):
            predictions, hidden, weights = self.decoder(decoder_input, hidden, encoder_output)  # (B, V), (1, B, D), (B, T)
            output.append(predictions.argmax(dim=-1))                         # (B,)
            attention.append(weights)                                         # (B, T)
            if output[i] == self.eos_idx:
                break
            else:
                decoder_input = output[i]                                     # (B,)
        return torch.cat(output, dim=0), torch.cat(attention, dim=0)          # (L,), (L, T)
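A minimal usage sketch of Seq2Seq.decode on a fake source sequence, with a deliberately tiny vocabulary and hidden size (the deployed model in app.py uses vocab_size=8000 and hidden_dim=512):

import torch

from models import Seq2Seq

model = Seq2Seq(vocab_size=100, hidden_dim=16, bos_idx=1, eos_idx=2, pad_idx=3)  # toy configuration
model.eval()

source = torch.randint(4, 100, (7,))                         # one fake source sequence of 7 token ids
output, weights = model.decode(source, max_decode_length=10)

print(output.shape)    # at most torch.Size([10]); shorter if <eos> is produced early
print(weights.shape)   # (len(output), 7) attention weights over the source tokens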
requirements.txt
ADDED
@@ -0,0 +1,8 @@
gradio
huggingface_hub
matplotlib
numpy
seaborn
sentencepiece
torch
torchtext