added demo base code
- .gitattributes +3 -0
- __init__.py +0 -0
- app.py +87 -0
- custom_bart/__init__.py +12 -0
- custom_bart/attention_utils.py +132 -0
- custom_bart/bart_attention.py +313 -0
- custom_bart/bart_for_conditional_generation.py +205 -0
- custom_bart/bart_generation_mixin.py +0 -0
- custom_bart/bart_mask_attention.py +238 -0
- custom_bart/bart_model.py +169 -0
- custom_bart/bart_onnx.py +240 -0
- custom_bart/config.py +197 -0
- custom_bart/custom_constants.py +168 -0
- custom_bart/custom_outputs.py +142 -0
- custom_bart/decoder.py +312 -0
- custom_bart/decoder_layer.py +134 -0
- custom_bart/encoder.py +216 -0
- custom_bart/encoder_layer.py +102 -0
- custom_tokenizer/__init__.py +1 -0
- custom_tokenizer/bart_custom_tokenizer_fast.py +484 -0
- data/__init__.py +0 -0
- data/relation_utils.py +53 -0
- inference.py +349 -0
- kgs_binding/__init__.py +3 -0
- kgs_binding/conceptnet/__init__.py +1 -0
- kgs_binding/conceptnet/conceptnet_english_noun_2_noun_relations.json +3 -0
- kgs_binding/conceptnet/conceptnet_english_nouns.json +3 -0
- kgs_binding/conceptnet/conceptnet_english_nouns_simple.json +3 -0
- kgs_binding/conceptnet_handler.py +61 -0
- kgs_binding/english_stopwords.txt +1126 -0
- kgs_binding/kg_base_wrapper.py +80 -0
- kgs_binding/kg_qa_binding_utils.py +73 -0
- kgs_binding/parsing_utils.py +86 -0
- kgs_binding/relation_mapper_builder.py +164 -0
- kgs_binding/swow/__init__.py +1 -0
- kgs_binding/swow/swow_knowledge.json +0 -0
- kgs_binding/swow_handler.py +75 -0
- model_utils.py +54 -0
- requirements.txt +4 -0
- utils.py +230 -0
.gitattributes
CHANGED
@@ -29,3 +29,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+kgs_binding/conceptnet/conceptnet_english_noun_2_noun_relations.json filter=lfs diff=lfs merge=lfs -text
+kgs_binding/conceptnet/conceptnet_english_nouns.json filter=lfs diff=lfs merge=lfs -text
+kgs_binding/conceptnet/conceptnet_english_nouns_simple.json filter=lfs diff=lfs merge=lfs -text
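(Entries of this form are what `git lfs track "<pattern>"` appends to .gitattributes, so the three ConceptNet JSON files above are stored through Git LFS rather than as regular Git blobs — hence the "+3 -0" for .gitattributes in the file list.)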
__init__.py
ADDED
File without changes
app.py
ADDED
@@ -0,0 +1,87 @@
import gradio as gr
import matplotlib.pyplot as plt

from inference import RelationsInference
from utils import KGType, Model_Type

#############################
# Constants
#############################

examples = [["What's the meaning of life?", "eli5", "constraint"],
            ["boat, water, bird", "commongen", "constraint"],
            ["What flows under a bridge?", "commonsense_qa", "constraint"]]

bart = RelationsInference(
    model_path='MrVicente/commonsense_bart_commongen',
    kg_type=KGType.CONCEPTNET,
    model_type=Model_Type.RELATIONS,
    max_length=32
)

#############################
# Helper
#############################

def infer_bart(context, task_type, decoding_type_str):
    response, encoder_attentions, model_input = bart.generate_based_on_context(context, use_kg=False)
    return response[0]


def plot_attention(layer, head):
    fig = plt.figure()
    plt.plot([1, 2, 3], [2, 4, 6])
    plt.title("Things")
    plt.ylabel("Cases")
    plt.xlabel("Days since Day 0")
    return fig


#############################
# Interface
#############################

app = gr.Blocks()
with app:
    gr.Markdown(
        """
        # Demo
        ### Test Commonsense Relation-Aware BART (BART-RA) model

        Tutorial: <br>
        1) Select the possible model variations and tasks;<br>
        2) Change the inputs and click the buttons to produce results;<br>
        3) See attention visualisations by choosing a specific layer and head;<br>
        """)
    with gr.Row():
        context_input = gr.Textbox(lines=2, value="What's the meaning of life?", label='Input:')
        model_result_output = gr.Textbox(lines=2, label='Model result:')
    with gr.Column():
        task_type_choice = gr.Radio(
            ["eli5", "commongen"], value="eli5", label="What task do you want to try?"
        )
        decoding_type_choice = gr.Radio(
            ["default", "constraint"], value="default", label="What decoding strategy do you want to use?"
        )
    with gr.Row():
        model_btn = gr.Button(value="See Model Results")
    gr.Markdown(
        """
        ---
        Observe Attention
        """
    )
    with gr.Row():
        with gr.Column():
            layer = gr.Slider(0, 11, 0, step=1, label="Layer")
            head = gr.Slider(0, 15, 0, step=1, label="Head")
        with gr.Column():
            plot_output = gr.Plot()
    with gr.Row():
        vis_btn = gr.Button(value="See Attention Scores")
    model_btn.click(fn=infer_bart, inputs=[context_input, task_type_choice, decoding_type_choice],
                    outputs=[model_result_output])
    vis_btn.click(fn=plot_attention, inputs=[layer, head], outputs=[plot_output])

if __name__ == '__main__':
    app.launch()
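The plot_attention helper above is a placeholder (it draws a fixed line). A minimal sketch of a real attention plot, assuming encoder_attentions is a tuple with one tensor of shape (batch, num_heads, seq_len, seq_len) per layer — the usual Hugging Face layout when output_attentions=True — and that tokens holds the corresponding input tokens; the function name and both arguments are illustrative, not part of the committed code:

# Hypothetical attention visualisation, under the assumptions stated above.
import matplotlib.pyplot as plt

def plot_attention_scores(encoder_attentions, tokens, layer, head):
    # pick one layer/head for the first example in the batch
    scores = encoder_attentions[int(layer)][0, int(head)].detach().cpu().numpy()
    fig, ax = plt.subplots()
    im = ax.imshow(scores, cmap="viridis")
    ax.set_xticks(range(len(tokens)))
    ax.set_yticks(range(len(tokens)))
    ax.set_xticklabels(tokens, rotation=90)
    ax.set_yticklabels(tokens)
    fig.colorbar(im, ax=ax)
    ax.set_title(f"Encoder attention - layer {int(layer)}, head {int(head)}")
    return fig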
custom_bart/__init__.py
ADDED
@@ -0,0 +1,12 @@
from .bart_attention import BartCustomAttention
from .bart_mask_attention import BartCustomMaskAttention
from .bart_for_conditional_generation import BartCustomForConditionalGeneration
from .bart_model import BartCustomModel
from .config import BartCustomConfig
from .custom_constants import BartConstants
from .decoder import *
from .decoder_layer import *
from .encoder import *
from .encoder_layer import *
from .bart_generation_mixin import *
from . import *
custom_bart/attention_utils.py
ADDED
@@ -0,0 +1,132 @@
#############################
# Imports
#############################

# Python modules

# Remote modules
import torch

# Local modules

#############################
# Constants
#############################

#############################
# Stuff
#############################

def find_head_to_mask(heads_mask) -> int:
    head_idx = torch.argmax(heads_mask)
    head_idx_simple = head_idx.item()
    return head_idx_simple

def commonsense_attention_mask_update(bsz, n_tokens, commonsense_matrix, attn_weights,
                                      num_heads=16, specific_head=0):
    commonsense_mask = torch.zeros(
        ((bsz, num_heads, n_tokens, n_tokens))
    )
    attn_weights_helper = attn_weights.reshape((num_heads, bsz, n_tokens, n_tokens))
    zeros = torch.zeros(
        ((bsz, n_tokens, n_tokens))
    )
    head_previous_attention_weights = attn_weights_helper[specific_head]
    attn_weights_helper[specific_head] = zeros
    attn_weights_helper = attn_weights_helper.reshape((bsz, num_heads, n_tokens, n_tokens))
    if commonsense_matrix is None:
        # ignore is not passed (ones -> neutral since multiplication is used)
        commonsense_matrix = torch.ones(
            ((bsz, n_tokens, n_tokens))
        )
    commonsense_mask = commonsense_mask.reshape((num_heads, bsz, n_tokens, n_tokens))
    commonsense_mask[specific_head] = head_previous_attention_weights * commonsense_matrix
    # TODO Stupid conversion
    commonsense_mask = commonsense_mask.reshape((bsz, num_heads, n_tokens, n_tokens)).to('cuda')
    return attn_weights_helper + commonsense_mask

def convert_relations_to_binary_mask(input_relations, should_clone=True):
    relations_binary_mask = input_relations
    if should_clone:
        relations_binary_mask = input_relations.clone()
    relations_binary_mask[relations_binary_mask > 1] = 1
    return relations_binary_mask

def relation_binary_2d_to_1d(relations_binary_mask):
    relations_binary_mask = relations_binary_mask.sum(dim=1)
    relations_binary_mask[relations_binary_mask > 1] = 1
    return relations_binary_mask

def create_layer_with_commonsense_on_specific_head(relation_binary_mask, bsz, num_heads, specific_head=0):
    n_tokens = relation_binary_mask.size()[-1]
    relations_mask = torch.zeros(
        (bsz, num_heads, n_tokens, n_tokens)
    )
    layer = relations_mask.reshape((num_heads, bsz, n_tokens, n_tokens))
    layer[specific_head] = relation_binary_mask
    layer = layer.reshape((bsz, num_heads, n_tokens, n_tokens))
    return layer

def update_weights_regarding_relations_on_specific_head(layer_head_mask, attn_weights, relation_inputs, bsz, num_heads, tgt_len, src_len, verbose=True):
    #layer_head_mask = layer_head_mask.to(attn_weights.device)
    inverse_layer_head_mask = (layer_head_mask.view(num_heads, 1, 1) - 1) * -1
    #inverse_layer_head_mask = inverse_layer_head_mask.to(attn_weights.device)
    #print('layer_head_mask:', layer_head_mask)
    if verbose:
        print("==============================")
        print('layer_head_mask.shape:', layer_head_mask.shape)
        print('inverse_layer_head_mask.shape:', inverse_layer_head_mask.shape)
        print('attn_weights.shape:', attn_weights.shape)
        print('relation_inputs.shape', relation_inputs.shape)
        print("==============================")
    #print('layer_head_mask.device:', layer_head_mask.device)
    #print('inverse_layer_head_mask.device:', inverse_layer_head_mask.device)
    #print('relation_inputs.device:', relation_inputs.device)
    intermediate_weights = inverse_layer_head_mask * attn_weights.view(bsz, num_heads, tgt_len, src_len)
    relation_inputs = convert_relations_to_binary_mask(relation_inputs, should_clone=False)
    relation_weights = layer_head_mask.view(num_heads, 1, 1) * relation_inputs.view(bsz, 1, tgt_len, src_len) * attn_weights.view(bsz, num_heads,
                                                                                                                                  tgt_len, src_len)
    attn_weights = intermediate_weights + relation_weights
    # [batch, n_heads, seq_length, seq_length]
    if verbose:
        print('attn_weights_int.shape', attn_weights.shape)
    return attn_weights

"""
def create_commonsense_mask(self, bsz, n_tokens, commonsense_matrix, num_heads=16, specific_head=0):
    commonsense_mask = torch.zeros(
        ((bsz, num_heads, n_tokens, n_tokens))
    )
    if commonsense_matrix is None:
        commonsense_matrix = torch.zeros(
            ((bsz, n_tokens, n_tokens))
        )
    commonsense_mask = commonsense_mask.reshape((num_heads, bsz, n_tokens, n_tokens))
    commonsense_mask[specific_head] = commonsense_matrix
    commonsense_mask = commonsense_mask.reshape((bsz, num_heads, n_tokens, n_tokens))
    return commonsense_mask

def commonsense_attention_mask_update(self, bsz, n_tokens, commonsense_matrix, attn_weights,
                                      specific_head=0):
    num_heads = self.num_heads
    commonsense_mask = torch.zeros(
        ((bsz, num_heads, n_tokens, n_tokens))
    )
    attn_weights_helper = attn_weights.reshape((num_heads, bsz, n_tokens, n_tokens))
    zeros = torch.zeros(
        ((bsz, n_tokens, n_tokens))
    )
    head_previous_attention_weights = attn_weights_helper[specific_head]
    attn_weights_helper[specific_head] = zeros
    attn_weights_helper = attn_weights_helper.reshape((bsz, num_heads, n_tokens, n_tokens))
    if commonsense_matrix is None:
        # ignore is not passed (ones -> neutral since multiplication is used)
        commonsense_matrix = torch.ones(
            ((bsz, n_tokens, n_tokens))
        )
    commonsense_mask = commonsense_mask.reshape((num_heads, bsz, n_tokens, n_tokens))
    commonsense_mask[specific_head] = head_previous_attention_weights * commonsense_matrix
    # TODO Stupid conversion
    commonsense_mask = commonsense_mask.reshape((bsz, num_heads, n_tokens, n_tokens)).to('cuda')
    return attn_weights_helper + commonsense_mask
"""
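A toy usage sketch of update_weights_regarding_relations_on_specific_head: for the heads selected by the mask, the (already softmaxed) attention weights are kept only where the relation matrix is non-zero, while the remaining heads pass through unchanged — roughly out = (1 - m) * A + m * B * A, with m the per-head mask and B the binarised relation matrix. The shapes and values below are made up for illustration:

# Illustration only; head 0 gets relation-masked attention, head 1 is untouched.
import torch
from custom_bart.attention_utils import update_weights_regarding_relations_on_specific_head

bsz, num_heads, seq = 1, 2, 4
attn = torch.rand(bsz * num_heads, seq, seq)        # softmaxed attention weights
relations = torch.randint(0, 3, (bsz, seq, seq))    # 0 = no relation, >0 = relation id
head_mask = torch.tensor([1.0, 0.0])                # apply relations only on head 0

out = update_weights_regarding_relations_on_specific_head(
    head_mask, attn, relations, bsz, num_heads, seq, seq, verbose=False
)
print(out.shape)  # torch.Size([1, 2, 4, 4])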
custom_bart/bart_attention.py
ADDED
@@ -0,0 +1,313 @@
#############################
# Imports
#############################

# Python modules
from typing import Optional, Tuple
# Remote modules
import torch
from torch import nn

# Local modules
from .attention_utils import (
    create_layer_with_commonsense_on_specific_head,
    find_head_to_mask,
    convert_relations_to_binary_mask,
    update_weights_regarding_relations_on_specific_head
)


class BartCustomAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        num_relation_kinds: int = 0,
        use_same_relation_kv_emb: bool = True,
        heads_mask: Optional[torch.Tensor] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        if heads_mask.size() != (self.num_heads,):
            raise ValueError(
                f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {heads_mask.size()}"
            )
        self.heads_mask = heads_mask

        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

        self.num_relation_kinds = num_relation_kinds
        self.relation_k_emb = nn.Embedding(num_relation_kinds + 1, self.head_dim, padding_idx=0)
        if use_same_relation_kv_emb:
            self.relation_v_emb = self.relation_k_emb
        else:
            self.relation_v_emb = nn.Embedding(num_relation_kinds + 1, self.head_dim, padding_idx=0)

        self.k_rel_scale = 0.0
        self.v_rel_scale = 1.0


    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relation_inputs: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        #print('device:', hidden_states.device)
        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, embed_dim = hidden_states.size()

        #print(relation_inputs.shape, 'VS ', (bsz, tgt_len, tgt_len))
        if relation_inputs is None:
            # TODO
            print('oh no')
            relation_inputs = torch.zeros((bsz, tgt_len, tgt_len)).to('cuda').long()
        print(relation_inputs.shape, ' | ', (bsz, tgt_len, tgt_len))
        assert relation_inputs.shape == (bsz, tgt_len, tgt_len)

        # (batch_size, seq_length, seq_length, self.num_relation_kinds, self.inner_dim // num_relation_kinds)
        relation_k_embeds = self.relation_k_emb(relation_inputs)
        relation_v_embeds = self.relation_v_emb(relation_inputs)

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj
        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states, value_states)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz)
        src_len = key_states.size(2)

        # compute scores
        attn_weights = torch.matmul(
            query_states, key_states.transpose(3, 2)
        )  # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9

        # q_t is [batch, seq_length, n_heads, dim_per_head]
        q_t = query_states.permute(0, 2, 1, 3)
        #print('qt.shape: ', q_t.shape)
        # r_t is [batch, seq_length, dim_per_head, seq_length]
        r_t = relation_k_embeds.transpose(-2, -1)
        #print('rt.shape: ', r_t.shape)

        q_tr_t_matmul = torch.matmul(q_t, r_t)  # [batch, seq_length, n_heads, seq_length]
        q_tr_tmatmul_t = q_tr_t_matmul.permute(0, 2, 1, 3)  # [batch, n_heads, seq_length, seq_length]

        # Make sure impact of relation-aware only apllicable on specific heads (k-part)

        #print("==========")
        #print('first K: ', q_tr_tmatmul_t.sum())
        """
        q_tr_tmatmul_t = self.layer_heads_relation_attention_update(
            self.heads_mask,
            q_tr_tmatmul_t,
        )
        """
        #print('second K: ', q_tr_tmatmul_t.sum())
        #print("==========")

        # give weight to influence
        #q_tr_tmatmul_t = 100.0 * q_tr_tmatmul_t

        # Add to scores
        #print('attn_weights k [before]', attn_weights)
        #print('attn_weights sum k [before]', attn_weights.sum())
        attn_weights += self.k_rel_scale * q_tr_tmatmul_t
        #attn_weights += 100.0 * q_tr_tmatmul_t
        #print('attn_weights k [after]: ', attn_weights)
        #print('attn_weights sum k [after]', attn_weights.sum())
        attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        # Wrong place... gonna comment
        """
        attn_weights = self.layer_heads_relation_attention_update(layer_head_mask,
                                                                  relation_inputs,
                                                                  attn_weights,
                                                                  bsz,
                                                                  tgt_len,
                                                                  src_len)
        """
        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)


        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to be reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states.view(*proj_shape))

        #print('attn_probs.shape', attn_probs.shape)
        # w_t is [batch, seq_length, n_heads, seq_length]
        w_t = attn_probs.view(bsz, self.num_heads, tgt_len, src_len).permute(0, 2, 1, 3)
        #print('w_t.shape 1:', w_t.shape)
        #print('relation_v_embeds.shape', relation_v_embeds.shape)
        # [batch, seq_length, n_heads, seq_length]
        w_tr_matmul = torch.matmul(w_t, relation_v_embeds)
        #print('w_tr_matmul.shape 1:', w_tr_matmul.shape)
        #print('w_tr_matmul.shape 2:', w_tr_matmul.shape)
        # Make sure impact of relation-aware only apllicable on specific heads (v-part)

        #print("==========")
        #print('first V sum: ', w_tr_matmul.sum())
        #print('first V: ', w_tr_matmul[0])
        """
        w_tr_matmul = self.layer_heads_relation_attention_v_update(
            self.heads_mask,
            w_tr_matmul,
            bsz,
            tgt_len,
        )
        """
        w_tr_matmul = self.v_rel_scale * w_tr_matmul
        #print('second V sum: ', w_tr_matmul.sum())
        #print('second V: ', w_tr_matmul[0])
        #print("==========")

        w_tr_matmul = w_tr_matmul.permute(0, 2, 1, 3)
        w_tr_matmul = w_tr_matmul.reshape(bsz * self.num_heads, tgt_len, self.head_dim)

        #print('attn_output v [before]', attn_output)
        #print('attn_output sum v [before]', attn_output.sum())
        attn_output += w_tr_matmul
        #attn_output += 100.0 * w_tr_matmul
        #print('attn_output v [after]', attn_output)
        #print('attn_output sum v [after]', attn_output.sum())
        #raise Exception()


        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)

        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
        # partitioned aross GPUs when using tensor-parallelism.
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value

    def layer_heads_relation_attention_update(self,
                                              layer_head_mask,
                                              data,
                                              ):
        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
                )
            #print('layer_head_mask:', layer_head_mask)
            masked_weights = layer_head_mask.view(self.num_heads, 1, 1) * data
            return masked_weights
        return data

    def layer_heads_relation_attention_v_update(self,
                                                layer_head_mask,
                                                data,
                                                bsz,
                                                tgt_len,
                                                ):
        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
                )
            #relation_binary_mask = convert_relations_to_binary_mask(relation_inputs)
            #one_dimension_mask = relation_binary_mask.sum(-1)
            #relation_binary_mask = convert_relations_to_binary_mask(one_dimension_mask)
            # [16, 128, 16, 64]
            masked_weights = layer_head_mask.view(self.num_heads, 1, 1) * data.view(bsz, self.num_heads, tgt_len, self.head_dim)
            return masked_weights.view(bsz, tgt_len, self.num_heads, self.head_dim)
        return data
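For reference, a hypothetical standalone use of BartCustomAttention; the embedding size, head count, and number of relation kinds below are illustrative, not the released checkpoint's values. Note the constructor validates the shape of heads_mask, so a tensor must be supplied:

# Illustration only, under the assumptions stated above.
import torch
from custom_bart.bart_attention import BartCustomAttention

embed_dim, num_heads, seq = 1024, 16, 8
attn = BartCustomAttention(
    embed_dim=embed_dim,
    num_heads=num_heads,
    num_relation_kinds=40,
    heads_mask=torch.ones(num_heads),   # required: the constructor checks this tensor's shape
)
hidden = torch.rand(2, seq, embed_dim)
relations = torch.randint(0, 41, (2, seq, seq))   # relation id per token pair, 0 = none
out, weights, _ = attn(hidden, relation_inputs=relations, output_attentions=True)
print(out.shape, weights.shape)  # torch.Size([2, 8, 1024]) torch.Size([2, 16, 8, 8])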
custom_bart/bart_for_conditional_generation.py
ADDED
@@ -0,0 +1,205 @@
#############################
# Imports
#############################

# Python modules
from typing import (
    Optional,
    Tuple,
    Union,
    List,
)

# Remote modules
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers import (
    BartConfig,
    BartPretrainedModel,
)
from transformers.modeling_outputs import Seq2SeqLMOutput
from transformers.models.bart.modeling_bart import shift_tokens_right

from transformers.utils import (
    add_end_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)

from .bart_model import BartCustomModel
from .config import BartCustomConfig
from .custom_constants import BartConstants
from .bart_generation_mixin import GenerationMixin
from .custom_outputs import CustomSeq2SeqLMOutput

logger = logging.get_logger(__name__)

@add_start_docstrings(
    "The BART Model with a language modeling head. Can be used for summarization.", BartConstants.BART_START_DOCSTRING
)
class BartCustomForConditionalGeneration(BartPretrainedModel, GenerationMixin):
    base_model_prefix = "model"
    _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head\.weight"]

    def __init__(self, config: BartCustomConfig):
        super().__init__(config)
        self.model = BartCustomModel(config)
        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding:
        new_embeddings = super().resize_token_embeddings(new_num_tokens)
        self._resize_final_logits_bias(new_num_tokens)
        return new_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        old_num_tokens = self.final_logits_bias.shape[-1]
        if new_num_tokens <= old_num_tokens:
            new_bias = self.final_logits_bias[:, :new_num_tokens]
        else:
            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @add_start_docstrings_to_model_forward(BartConstants.BART_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=BartConstants.CONFIG_FOR_DOC)
    @add_end_docstrings(BartConstants.BART_GENERATION_EXAMPLE)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        input_commonsense_relations: Optional[torch.Tensor] = None,
        reduce_ce=True,
    ) -> Union[Tuple, CustomSeq2SeqLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            relation_inputs=input_commonsense_relations
        )
        lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(reduce=reduce_ce, ignore_index=self.config.pad_token_id)  # added ignore_index=self.config.pad_token_id
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return CustomSeq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
            head_mask=outputs.encoder_head_mask
        )

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs
    ):
        # cut decoder_input_ids if past is used
        if past is not None:
            decoder_input_ids = decoder_input_ids[:, -1:]

        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)

    @staticmethod
    def _reorder_cache(past, beam_idx):
        reordered_past = ()
        for layer_past in past:
            # cached cross_attention states don't have to be reordered -> they are always the same
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
            )
        return reordered_past
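A hedged loading sketch, assuming the 'MrVicente/commonsense_bart_commongen' checkpoint referenced in app.py was saved from BartCustomForConditionalGeneration and therefore ships a compatible BartCustomConfig:

# Illustration only; loading details depend on how the checkpoint was exported.
from custom_bart import BartCustomConfig, BartCustomForConditionalGeneration

model_name = 'MrVicente/commonsense_bart_commongen'
config = BartCustomConfig.from_pretrained(model_name)
model = BartCustomForConditionalGeneration.from_pretrained(model_name, config=config)
model.eval()
print(sum(p.numel() for p in model.parameters()))  # quick sanity check on the loaded weights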
custom_bart/bart_generation_mixin.py
ADDED
The diff for this file is too large to render.
custom_bart/bart_mask_attention.py
ADDED
@@ -0,0 +1,238 @@
#############################
# Imports
#############################

# Python modules
from typing import Optional, Tuple

# Remote modules
import torch
from torch import nn

# Local modules
from .attention_utils import update_weights_regarding_relations_on_specific_head


class BartCustomMaskAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        num_relation_kinds: int = 0,
        heads_mask: Optional[torch.Tensor] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        if heads_mask.size() != (self.num_heads,):
            raise ValueError(
                f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {heads_mask.size()}"
            )
        self.heads_mask = heads_mask

        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

        self.num_relation_kinds = num_relation_kinds


    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relation_inputs: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, embed_dim = hidden_states.size()

        #print(relation_inputs.shape, 'VS ', (bsz, tgt_len, tgt_len))
        if relation_inputs is None:
            # TODO
            relation_inputs = torch.zeros((bsz, tgt_len, tgt_len)).to('cuda').long()
        assert relation_inputs.shape == (bsz, tgt_len, tgt_len)

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj
        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states, value_states)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if self.heads_mask is not None:  # and layer_head_mask is not None:
            if self.heads_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
                )
            h_mask = layer_head_mask
            #print('h_mask: ', h_mask)
            if layer_head_mask is None:
                h_mask = self.heads_mask
            #h_mask.to(attn_weights.device)
            attn_weights = update_weights_regarding_relations_on_specific_head(h_mask, attn_weights,
                                                                               relation_inputs, bsz, self.num_heads, tgt_len,
                                                                               src_len, verbose=False)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        elif layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)


        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to be reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)

        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
        # partitioned aross GPUs when using tensor-parallelism.
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value

    def find_head_to_mask(self, heads_mask) -> int:
        head_idx = torch.argmax(heads_mask)
        head_idx_simple = head_idx.item()
        return head_idx_simple

    def create_commonsense_mask(self, bsz, n_tokens, commonsense_matrix, num_heads=16, specific_head=0):
        commonsense_mask = torch.zeros(
            ((bsz, num_heads, n_tokens, n_tokens))
        )
        if commonsense_matrix is None:
            commonsense_matrix = torch.zeros(
                ((bsz, n_tokens, n_tokens))
            )
        commonsense_mask = commonsense_mask.reshape((num_heads, bsz, n_tokens, n_tokens))
        commonsense_mask[specific_head] = commonsense_matrix
        commonsense_mask = commonsense_mask.reshape((bsz, num_heads, n_tokens, n_tokens))
        return commonsense_mask

    def commonsense_attention_mask_update(self, bsz, n_tokens, commonsense_matrix, attn_weights,
                                          specific_head=0):
        num_heads = self.num_heads
        commonsense_mask = torch.zeros(
            ((bsz, num_heads, n_tokens, n_tokens))
        )
        attn_weights_helper = attn_weights.reshape((num_heads, bsz, n_tokens, n_tokens))
        zeros = torch.zeros(
            ((bsz, n_tokens, n_tokens))
        )
        head_previous_attention_weights = attn_weights_helper[specific_head]
        attn_weights_helper[specific_head] = zeros
        attn_weights_helper = attn_weights_helper.reshape((bsz, num_heads, n_tokens, n_tokens))
        if commonsense_matrix is None:
            # ignore is not passed (ones -> neutral since multiplication is used)
            commonsense_matrix = torch.ones(
                ((bsz, n_tokens, n_tokens))
            )
        commonsense_mask = commonsense_mask.reshape((num_heads, bsz, n_tokens, n_tokens))
        commonsense_mask[specific_head] = head_previous_attention_weights * commonsense_matrix
        # TODO Stupid conversion
        commonsense_mask = commonsense_mask.reshape((bsz, num_heads, n_tokens, n_tokens)).to('cuda')
        return attn_weights_helper + commonsense_mask

    def convert_relations_to_binary_mask(self, input_relations):
        relations_binary_mask = input_relations.clone()
        relations_binary_mask[relations_binary_mask > 1] = 1
        return relations_binary_mask
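In short, the two attention variants use the relation matrix differently: BartCustomAttention looks relation ids up in dedicated relation_k_emb/relation_v_emb embeddings and adds the resulting terms to the attention scores and outputs (scaled by k_rel_scale/v_rel_scale), whereas BartCustomMaskAttention has no relation embeddings and instead applies the binarised relation matrix as a multiplicative mask on the heads selected by heads_mask, via update_weights_regarding_relations_on_specific_head.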
custom_bart/bart_model.py
ADDED
@@ -0,0 +1,169 @@
1 |
+
#############################
|
2 |
+
# Imports
|
3 |
+
#############################
|
4 |
+
|
5 |
+
# Python modules
|
6 |
+
from typing import (
|
7 |
+
Optional,
|
8 |
+
Tuple,
|
9 |
+
Union,
|
10 |
+
List,
|
11 |
+
)
|
12 |
+
|
13 |
+
# Remote modules
|
14 |
+
import torch
|
15 |
+
from torch import nn
|
16 |
+
from transformers import (
|
17 |
+
BartConfig,
|
18 |
+
BartPretrainedModel,
|
19 |
+
)
|
20 |
+
from transformers.modeling_outputs import (
|
21 |
+
BaseModelOutput, Seq2SeqModelOutput,
|
22 |
+
)
|
23 |
+
from transformers.models.bart.modeling_bart import shift_tokens_right
|
24 |
+
|
25 |
+
from transformers.utils import (
|
26 |
+
add_code_sample_docstrings,
|
27 |
+
add_end_docstrings,
|
28 |
+
add_start_docstrings,
|
29 |
+
add_start_docstrings_to_model_forward,
|
30 |
+
logging,
|
31 |
+
replace_return_docstrings,
|
32 |
+
)
|
33 |
+
|
34 |
+
# Local modules
|
35 |
+
from .config import BartCustomConfig
|
36 |
+
from .encoder import BartCustomEncoder
|
37 |
+
from .decoder import BartCustomDecoder
|
38 |
+
from .custom_constants import BartConstants
|
39 |
+
from .custom_outputs import CustomSeq2SeqModelOutput
|
40 |
+
|
41 |
+
@add_start_docstrings(
|
42 |
+
"The bare BART Model outputting raw hidden-states without any specific head on top.",
|
43 |
+
BartConstants.BART_START_DOCSTRING,
|
44 |
+
)
|
45 |
+
class BartCustomModel(BartPretrainedModel):
|
46 |
+
def __init__(self, config: BartCustomConfig):
|
47 |
+
super().__init__(config)
|
48 |
+
|
49 |
+
padding_idx, vocab_size = config.pad_token_id, config.vocab_size
|
50 |
+
self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
|
51 |
+
|
52 |
+
self.encoder = BartCustomEncoder(config, self.shared)
|
53 |
+
self.decoder = BartCustomDecoder(config, self.shared)
|
54 |
+
|
55 |
+
# Initialize weights and apply final processing
|
56 |
+
self.post_init()
|
57 |
+
|
58 |
+
def get_input_embeddings(self):
|
59 |
+
return self.shared
|
60 |
+
|
61 |
+
def set_input_embeddings(self, value):
|
62 |
+
self.shared = value
|
63 |
+
self.encoder.embed_tokens = self.shared
|
64 |
+
self.decoder.embed_tokens = self.shared
|
65 |
+
|
66 |
+
def get_encoder(self):
|
67 |
+
return self.encoder
|
68 |
+
|
69 |
+
def get_decoder(self):
|
70 |
+
return self.decoder
|
71 |
+
|
72 |
+
@add_start_docstrings_to_model_forward(BartConstants.BART_INPUTS_DOCSTRING)
|
73 |
+
@add_code_sample_docstrings(
|
74 |
+
processor_class= BartConstants.TOKENIZER_FOR_DOC,
|
75 |
+
checkpoint= BartConstants.CHECKPOINT_FOR_DOC,
|
76 |
+
output_type= Seq2SeqModelOutput,
|
77 |
+
config_class= BartConstants.CONFIG_FOR_DOC,
|
78 |
+
expected_output= BartConstants.EXPECTED_OUTPUT_SHAPE,
|
79 |
+
)
|
80 |
+
def forward(
|
81 |
+
self,
|
82 |
+
input_ids: torch.LongTensor = None,
|
83 |
+
attention_mask: Optional[torch.Tensor] = None,
|
84 |
+
decoder_input_ids: Optional[torch.LongTensor] = None,
|
85 |
+
decoder_attention_mask: Optional[torch.LongTensor] = None,
|
86 |
+
head_mask: Optional[torch.Tensor] = None,
|
87 |
+
decoder_head_mask: Optional[torch.Tensor] = None,
|
88 |
+
cross_attn_head_mask: Optional[torch.Tensor] = None,
|
89 |
+
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
|
90 |
+
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
91 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
92 |
+
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
|
93 |
+
use_cache: Optional[bool] = None,
|
94 |
+
output_attentions: Optional[bool] = None,
|
95 |
+
output_hidden_states: Optional[bool] = None,
|
96 |
+
return_dict: Optional[bool] = None,
|
97 |
+
relation_inputs: Optional[torch.Tensor] = None,
|
98 |
+
) -> Union[Tuple, CustomSeq2SeqModelOutput]:
|
99 |
+
|
100 |
+
# different to other models, Bart automatically creates decoder_input_ids from
|
101 |
+
# input_ids if no decoder_input_ids are provided
|
102 |
+
if decoder_input_ids is None and decoder_inputs_embeds is None:
|
103 |
+
if input_ids is None:
|
104 |
+
raise ValueError(
|
105 |
+
"If no `decoder_input_ids` or `decoder_inputs_embeds` are "
|
106 |
+
"passed, `input_ids` cannot be `None`. Please pass either "
|
107 |
+
"`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
|
108 |
+
)
|
109 |
+
|
110 |
+
decoder_input_ids = shift_tokens_right(
|
111 |
+
input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
|
112 |
+
)
|
113 |
+
|
114 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
115 |
+
output_hidden_states = (
|
116 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
117 |
+
)
|
118 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
119 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
120 |
+
|
121 |
+
if encoder_outputs is None:
|
122 |
+
encoder_outputs = self.encoder(
|
123 |
+
input_ids=input_ids,
|
124 |
+
attention_mask=attention_mask,
|
125 |
+
head_mask=head_mask,
|
126 |
+
inputs_embeds=inputs_embeds,
|
127 |
+
output_attentions=output_attentions,
|
128 |
+
output_hidden_states=output_hidden_states,
|
129 |
+
return_dict=return_dict,
|
130 |
+
relation_inputs=relation_inputs
|
131 |
+
)
|
132 |
+
# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
|
133 |
+
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
|
134 |
+
encoder_outputs = BaseModelOutput(
|
135 |
+
last_hidden_state=encoder_outputs[0],
|
136 |
+
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
|
137 |
+
attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
|
138 |
+
)
|
139 |
+
|
140 |
+
# decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
|
141 |
+
decoder_outputs = self.decoder(
|
142 |
+
input_ids=decoder_input_ids,
|
143 |
+
attention_mask=decoder_attention_mask,
|
144 |
+
encoder_hidden_states=encoder_outputs[0],
|
145 |
+
encoder_attention_mask=attention_mask,
|
146 |
+
head_mask=decoder_head_mask,
|
147 |
+
cross_attn_head_mask=cross_attn_head_mask,
|
148 |
+
past_key_values=past_key_values,
|
149 |
+
inputs_embeds=decoder_inputs_embeds,
|
150 |
+
use_cache=use_cache,
|
151 |
+
output_attentions=output_attentions,
|
152 |
+
output_hidden_states=output_hidden_states,
|
153 |
+
return_dict=return_dict,
|
154 |
+
)
|
155 |
+
|
156 |
+
if not return_dict:
|
157 |
+
return decoder_outputs + encoder_outputs
|
158 |
+
|
159 |
+
return CustomSeq2SeqModelOutput(
|
160 |
+
last_hidden_state=decoder_outputs.last_hidden_state,
|
161 |
+
past_key_values=decoder_outputs.past_key_values,
|
162 |
+
decoder_hidden_states=decoder_outputs.hidden_states,
|
163 |
+
decoder_attentions=decoder_outputs.attentions,
|
164 |
+
cross_attentions=decoder_outputs.cross_attentions,
|
165 |
+
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
|
166 |
+
encoder_hidden_states=encoder_outputs.hidden_states,
|
167 |
+
encoder_attentions=encoder_outputs.attentions,
|
168 |
+
encoder_head_mask=head_mask
|
169 |
+
)
|
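Not part of the commit, but a minimal smoke-test sketch of the forward pass above may help reviewers. The import paths follow this repo's file layout; `num_relation_kinds=8` and the all-zero relation tensor are placeholder assumptions, and the relation tensor shape follows the `(batch, encoder_sequence, encoder_sequence)` axes declared for `input_commonsense_relations` in `bart_onnx.py` below.

```python
# Minimal smoke-test sketch (not in the commit); values below are placeholders.
import torch

from custom_bart.config import BartSmallCustomConfig
from custom_bart.bart_model import BartCustomModel

config = BartSmallCustomConfig(num_relation_kinds=8)  # assumed > 0 so relation embeddings exist
model = BartCustomModel(config).eval()

batch, seq_len = 2, 6
input_ids = torch.randint(10, config.vocab_size, (batch, seq_len))
attention_mask = torch.ones(batch, seq_len, dtype=torch.long)
# token-pair relation ids; 0 is assumed to mean "no relation"
relation_inputs = torch.zeros(batch, seq_len, seq_len, dtype=torch.long)

with torch.no_grad():
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        relation_inputs=relation_inputs,
        return_dict=True,
    )
print(outputs.last_hidden_state.shape)  # (batch, seq_len, config.d_model)
```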
custom_bart/bart_onnx.py
ADDED
@@ -0,0 +1,240 @@
1 |
+
|
2 |
+
from collections import OrderedDict
|
3 |
+
from typing import Any, Mapping, Optional
|
4 |
+
|
5 |
+
import torch
|
6 |
+
from transformers import PreTrainedTokenizer
|
7 |
+
from transformers.onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
|
8 |
+
from transformers.onnx.utils import compute_effective_axis_dimension
|
9 |
+
from transformers.utils.generic import TensorType
|
10 |
+
from transformers.utils.import_utils import is_torch_available
|
11 |
+
|
12 |
+
class BartCustumOnnxConfig(OnnxSeq2SeqConfigWithPast):
|
13 |
+
@property
|
14 |
+
def inputs(self) -> Mapping[str, Mapping[int, str]]:
|
15 |
+
if self.task in ["default", "seq2seq-lm"]:
|
16 |
+
common_inputs = OrderedDict(
|
17 |
+
[
|
18 |
+
("input_ids", {0: "batch", 1: "encoder_sequence"}),
|
19 |
+
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
|
20 |
+
("input_commonsense_relations", {0: "batch", 1: "encoder_sequence", 2: "encoder_sequence"}),
|
21 |
+
]
|
22 |
+
)
|
23 |
+
|
24 |
+
if self.use_past:
|
25 |
+
common_inputs["decoder_input_ids"] = {0: "batch"}
|
26 |
+
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
|
27 |
+
else:
|
28 |
+
common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
|
29 |
+
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
|
30 |
+
|
31 |
+
if self.use_past:
|
32 |
+
self.fill_with_past_key_values_(common_inputs, direction="inputs")
|
33 |
+
elif self.task == "causal-lm":
|
34 |
+
# TODO: figure this case out.
|
35 |
+
common_inputs = OrderedDict(
|
36 |
+
[
|
37 |
+
("input_ids", {0: "batch", 1: "encoder_sequence"}),
|
38 |
+
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
|
39 |
+
]
|
40 |
+
)
|
41 |
+
if self.use_past:
|
42 |
+
num_encoder_layers, _ = self.num_layers
|
43 |
+
for i in range(num_encoder_layers):
|
44 |
+
common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
|
45 |
+
common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
|
46 |
+
else:
|
47 |
+
common_inputs = OrderedDict(
|
48 |
+
[
|
49 |
+
("input_ids", {0: "batch", 1: "encoder_sequence"}),
|
50 |
+
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
|
51 |
+
("input_commonsense_relations", {0: "batch", 2: "encoder_sequence", 3: "encoder_sequence"}),
|
52 |
+
("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}),
|
53 |
+
("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}),
|
54 |
+
]
|
55 |
+
)
|
56 |
+
|
57 |
+
return common_inputs
|
58 |
+
|
59 |
+
@property
|
60 |
+
def outputs(self) -> Mapping[str, Mapping[int, str]]:
|
61 |
+
if self.task in ["default", "seq2seq-lm"]:
|
62 |
+
common_outputs = super().outputs
|
63 |
+
else:
|
64 |
+
common_outputs = super(OnnxConfigWithPast, self).outputs
|
65 |
+
if self.use_past:
|
66 |
+
num_encoder_layers, _ = self.num_layers
|
67 |
+
for i in range(num_encoder_layers):
|
68 |
+
common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
|
69 |
+
common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
|
70 |
+
return common_outputs
|
71 |
+
|
72 |
+
def _generate_dummy_inputs_for_default_and_seq2seq_lm(
|
73 |
+
self,
|
74 |
+
tokenizer: PreTrainedTokenizer,
|
75 |
+
batch_size: int = -1,
|
76 |
+
seq_length: int = -1,
|
77 |
+
is_pair: bool = False,
|
78 |
+
framework: Optional[TensorType] = None,
|
79 |
+
) -> Mapping[str, Any]:
|
80 |
+
encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
|
81 |
+
tokenizer, batch_size, seq_length, is_pair, framework
|
82 |
+
)
|
83 |
+
|
84 |
+
# Generate decoder inputs
|
85 |
+
decoder_seq_length = seq_length if not self.use_past else 1
|
86 |
+
decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
|
87 |
+
tokenizer, batch_size, decoder_seq_length, is_pair, framework
|
88 |
+
)
|
89 |
+
decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
|
90 |
+
common_inputs = dict(**encoder_inputs, **decoder_inputs)
|
91 |
+
|
92 |
+
if self.use_past:
|
93 |
+
if not is_torch_available():
|
94 |
+
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
|
95 |
+
else:
|
96 |
+
import torch
|
97 |
+
batch, encoder_seq_length = common_inputs["input_ids"].shape
|
98 |
+
decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
|
99 |
+
num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads
|
100 |
+
encoder_shape = (
|
101 |
+
batch,
|
102 |
+
num_encoder_attention_heads,
|
103 |
+
encoder_seq_length,
|
104 |
+
self._config.hidden_size // num_encoder_attention_heads,
|
105 |
+
)
|
106 |
+
decoder_past_length = decoder_seq_length + 3
|
107 |
+
decoder_shape = (
|
108 |
+
batch,
|
109 |
+
num_decoder_attention_heads,
|
110 |
+
decoder_past_length,
|
111 |
+
self._config.hidden_size // num_decoder_attention_heads,
|
112 |
+
)
|
113 |
+
|
114 |
+
common_inputs["decoder_attention_mask"] = torch.cat(
|
115 |
+
[common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
|
116 |
+
)
|
117 |
+
|
118 |
+
common_inputs["past_key_values"] = []
|
119 |
+
# If the number of encoder and decoder layers are present in the model configuration, both are considered
|
120 |
+
num_encoder_layers, num_decoder_layers = self.num_layers
|
121 |
+
min_num_layers = min(num_encoder_layers, num_decoder_layers)
|
122 |
+
max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers
|
123 |
+
remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"
|
124 |
+
|
125 |
+
for _ in range(min_num_layers):
|
126 |
+
common_inputs["past_key_values"].append(
|
127 |
+
(
|
128 |
+
torch.zeros(decoder_shape),
|
129 |
+
torch.zeros(decoder_shape),
|
130 |
+
torch.zeros(encoder_shape),
|
131 |
+
torch.zeros(encoder_shape),
|
132 |
+
)
|
133 |
+
)
|
134 |
+
# TODO: test this.
|
135 |
+
shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
|
136 |
+
for _ in range(min_num_layers, max_num_layers):
|
137 |
+
common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape)))
|
138 |
+
return common_inputs
|
139 |
+
|
140 |
+
def _generate_dummy_inputs_for_causal_lm(
|
141 |
+
self,
|
142 |
+
tokenizer: PreTrainedTokenizer,
|
143 |
+
batch_size: int = -1,
|
144 |
+
seq_length: int = -1,
|
145 |
+
is_pair: bool = False,
|
146 |
+
framework: Optional[TensorType] = None,
|
147 |
+
) -> Mapping[str, Any]:
|
148 |
+
common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
|
149 |
+
tokenizer, batch_size, seq_length, is_pair, framework
|
150 |
+
)
|
151 |
+
|
152 |
+
if self.use_past:
|
153 |
+
if not is_torch_available():
|
154 |
+
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
|
155 |
+
else:
|
156 |
+
import torch
|
157 |
+
batch, seqlen = common_inputs["input_ids"].shape
|
158 |
+
# Not using the same length for past_key_values
|
159 |
+
past_key_values_length = seqlen + 2
|
160 |
+
num_encoder_layers, _ = self.num_layers
|
161 |
+
num_encoder_attention_heads, _ = self.num_attention_heads
|
162 |
+
past_shape = (
|
163 |
+
batch,
|
164 |
+
num_encoder_attention_heads,
|
165 |
+
past_key_values_length,
|
166 |
+
self._config.hidden_size // num_encoder_attention_heads,
|
167 |
+
)
|
168 |
+
|
169 |
+
mask_dtype = common_inputs["attention_mask"].dtype
|
170 |
+
common_inputs["attention_mask"] = torch.cat(
|
171 |
+
[common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
|
172 |
+
)
|
173 |
+
common_inputs["past_key_values"] = [
|
174 |
+
(torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_encoder_layers)
|
175 |
+
]
|
176 |
+
return common_inputs
|
177 |
+
|
178 |
+
def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
|
179 |
+
self,
|
180 |
+
tokenizer: PreTrainedTokenizer,
|
181 |
+
batch_size: int = -1,
|
182 |
+
seq_length: int = -1,
|
183 |
+
is_pair: bool = False,
|
184 |
+
framework: Optional[TensorType] = None,
|
185 |
+
) -> Mapping[str, Any]:
|
186 |
+
# Copied from OnnxConfig.generate_dummy_inputs
|
187 |
+
# Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity.
|
188 |
+
# If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
|
189 |
+
batch_size = compute_effective_axis_dimension(
|
190 |
+
batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
|
191 |
+
)
|
192 |
+
|
193 |
+
# If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
|
194 |
+
token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
|
195 |
+
seq_length = compute_effective_axis_dimension(
|
196 |
+
seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
|
197 |
+
)
|
198 |
+
|
199 |
+
# Generate dummy inputs according to compute batch and sequence
|
200 |
+
dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
|
201 |
+
tmp_seq_length = seq_length + 2
|
202 |
+
commonsense_relation= torch.IntTensor([[[0] * tmp_seq_length] * tmp_seq_length]* batch_size)
|
203 |
+
common_inputs = dict(tokenizer(dummy_input,
|
204 |
+
return_tensors=framework))
|
205 |
+
common_inputs['input_commonsense_relations'] = commonsense_relation
|
206 |
+
print('here:', common_inputs)
|
207 |
+
return common_inputs
|
208 |
+
|
209 |
+
def generate_dummy_inputs(
|
210 |
+
self,
|
211 |
+
tokenizer: PreTrainedTokenizer,
|
212 |
+
batch_size: int = -1,
|
213 |
+
seq_length: int = -1,
|
214 |
+
is_pair: bool = False,
|
215 |
+
framework: Optional[TensorType] = None,
|
216 |
+
) -> Mapping[str, Any]:
|
217 |
+
if self.task in ["default", "seq2seq-lm"]:
|
218 |
+
common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm(
|
219 |
+
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
|
220 |
+
)
|
221 |
+
|
222 |
+
elif self.task == "causal-lm":
|
223 |
+
common_inputs = self._generate_dummy_inputs_for_causal_lm(
|
224 |
+
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
|
225 |
+
)
|
226 |
+
else:
|
227 |
+
common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
|
228 |
+
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
|
229 |
+
)
|
230 |
+
if 'decoder_input_commonsense_relations' in common_inputs:
|
231 |
+
del common_inputs['decoder_input_commonsense_relations']
|
232 |
+
return common_inputs
|
233 |
+
|
234 |
+
def _flatten_past_key_values_(self, flattened_output, name, idx, t):
|
235 |
+
if self.task in ["default", "seq2seq-lm"]:
|
236 |
+
flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t)
|
237 |
+
else:
|
238 |
+
flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_(
|
239 |
+
flattened_output, name, idx, t
|
240 |
+
)
|
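A hedged usage sketch for the ONNX config above (not in the commit): it only exercises `generate_dummy_inputs`, which is defined in this file; the `facebook/bart-base` tokenizer checkpoint is an assumption.

```python
# Hedged sketch (not in the commit): inspect the dummy inputs the ONNX config builds.
from transformers import BartTokenizer, TensorType

from custom_bart.config import BartCustomConfig
from custom_bart.bart_onnx import BartCustumOnnxConfig

onnx_config = BartCustumOnnxConfig(BartCustomConfig(), task="seq2seq-lm")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

dummy = onnx_config.generate_dummy_inputs(
    tokenizer, batch_size=2, seq_length=8, framework=TensorType.PYTORCH
)
# Expect input_ids, attention_mask, input_commonsense_relations,
# decoder_input_ids, decoder_attention_mask (plus past_key_values when use_past=True).
print(sorted(dummy.keys()))
print(dummy["input_commonsense_relations"].shape)  # square (seq x seq) relation grid per batch item
```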
custom_bart/config.py
ADDED
@@ -0,0 +1,197 @@
1 |
+
from transformers import BartConfig
|
2 |
+
|
3 |
+
class BartCustomConfig(BartConfig):
|
4 |
+
def __init__(
|
5 |
+
self,
|
6 |
+
model_type='bart',
|
7 |
+
vocab_size=50265,
|
8 |
+
max_position_embeddings=1024,
|
9 |
+
encoder_layers=12,
|
10 |
+
encoder_ffn_dim=4096,
|
11 |
+
encoder_attention_heads=16,
|
12 |
+
decoder_layers=12,
|
13 |
+
decoder_ffn_dim=4096,
|
14 |
+
decoder_attention_heads=16,
|
15 |
+
encoder_layerdrop=0.0,
|
16 |
+
decoder_layerdrop=0.0,
|
17 |
+
activation_function="gelu",
|
18 |
+
d_model=1024,
|
19 |
+
dropout=0.1,
|
20 |
+
attention_dropout=0.1,
|
21 |
+
activation_dropout=0.1,
|
22 |
+
init_std=0.02,
|
23 |
+
classifier_dropout=0.0,
|
24 |
+
classif_dropout=0.1,
|
25 |
+
scale_embedding=False,
|
26 |
+
use_cache=True,
|
27 |
+
num_labels=3,
|
28 |
+
pad_token_id=1,
|
29 |
+
bos_token_id=0,
|
30 |
+
eos_token_id=2,
|
31 |
+
is_encoder_decoder=True,
|
32 |
+
decoder_start_token_id=2,
|
33 |
+
forced_eos_token_id=2,
|
34 |
+
forced_bos_token_id=0,
|
35 |
+
no_repeat_ngram_size=3, # adding
|
36 |
+
num_hidden_layers=12,
|
37 |
+
normalize_before=False,
|
38 |
+
num_beams=4,
|
39 |
+
add_bias_logits=False,
|
40 |
+
add_final_layer_norm=False,
|
41 |
+
early_stopping=True,
|
42 |
+
gradient_checkpointing=False,
|
43 |
+
num_relation_kinds = 0,
|
44 |
+
use_same_relation_kv_emb = True,
|
45 |
+
is_simple_mask_commonsense = False,
|
46 |
+
should_embed_positions = False,
|
47 |
+
heads_mask = None,
|
48 |
+
**kwargs
|
49 |
+
):
|
50 |
+
super(BartCustomConfig, self).__init__(
|
51 |
+
model_type=model_type,
|
52 |
+
vocab_size=vocab_size,
|
53 |
+
max_position_embeddings=max_position_embeddings,
|
54 |
+
encoder_layers=encoder_layers,
|
55 |
+
encoder_ffn_dim=encoder_ffn_dim,
|
56 |
+
encoder_attention_heads=encoder_attention_heads,
|
57 |
+
decoder_layers=decoder_layers,
|
58 |
+
decoder_ffn_dim=decoder_ffn_dim,
|
59 |
+
decoder_attention_heads=decoder_attention_heads,
|
60 |
+
encoder_layerdrop=encoder_layerdrop,
|
61 |
+
decoder_layerdrop=decoder_layerdrop,
|
62 |
+
activation_function=activation_function,
|
63 |
+
d_model=d_model,
|
64 |
+
dropout=dropout,
|
65 |
+
attention_dropout=attention_dropout,
|
66 |
+
activation_dropout=activation_dropout,
|
67 |
+
init_std=init_std,
|
68 |
+
classifier_dropout=classifier_dropout,
|
69 |
+
classif_dropout=classif_dropout,
|
70 |
+
scale_embedding=scale_embedding,
|
71 |
+
use_cache=use_cache,
|
72 |
+
num_labels=num_labels,
|
73 |
+
pad_token_id = pad_token_id,
|
74 |
+
bos_token_id = bos_token_id,
|
75 |
+
eos_token_id = eos_token_id,
|
76 |
+
is_encoder_decoder = is_encoder_decoder,
|
77 |
+
decoder_start_token_id = decoder_start_token_id,
|
78 |
+
forced_eos_token_id = forced_eos_token_id,
|
79 |
+
forced_bos_token_id=forced_bos_token_id,
|
80 |
+
no_repeat_ngram_size=no_repeat_ngram_size, # Adding
|
81 |
+
normalize_before=normalize_before,
|
82 |
+
num_hidden_layers=num_hidden_layers,
|
83 |
+
num_beams=num_beams,
|
84 |
+
add_bias_logits=add_bias_logits,
|
85 |
+
add_final_layer_norm=add_final_layer_norm,
|
86 |
+
early_stopping=early_stopping,
|
87 |
+
gradient_checkpointing=gradient_checkpointing,
|
88 |
+
num_relation_kinds = num_relation_kinds,
|
89 |
+
use_same_relation_kv_emb = use_same_relation_kv_emb,
|
90 |
+
is_simple_mask_commonsense = is_simple_mask_commonsense,
|
91 |
+
heads_mask = heads_mask,
|
92 |
+
should_embed_positions=should_embed_positions,
|
93 |
+
**kwargs
|
94 |
+
)
|
95 |
+
self.num_relation_kinds = num_relation_kinds
|
96 |
+
self.use_same_relation_kv_emb = use_same_relation_kv_emb
|
97 |
+
self.is_simple_mask_commonsense = is_simple_mask_commonsense
|
98 |
+
self.heads_mask = heads_mask
|
99 |
+
self.should_embed_positions = should_embed_positions
|
100 |
+
|
101 |
+
class BartSmallCustomConfig(BartConfig):
|
102 |
+
def __init__(
|
103 |
+
self,
|
104 |
+
vocab_size=50265,
|
105 |
+
max_position_embeddings=1024,
|
106 |
+
encoder_layers=6,
|
107 |
+
encoder_ffn_dim=3072,
|
108 |
+
encoder_attention_heads=12,
|
109 |
+
decoder_layers=12,
|
110 |
+
decoder_ffn_dim=3072,
|
111 |
+
decoder_attention_heads=12,
|
112 |
+
encoder_layerdrop=0.0,
|
113 |
+
decoder_layerdrop=0.0,
|
114 |
+
activation_function="gelu",
|
115 |
+
d_model=768,
|
116 |
+
dropout=0.1,
|
117 |
+
attention_dropout=0.1,
|
118 |
+
activation_dropout=0.1,
|
119 |
+
init_std=0.02,
|
120 |
+
classifier_dropout=0.0,
|
121 |
+
classif_dropout= 0.1,
|
122 |
+
scale_embedding=False,
|
123 |
+
use_cache=True,
|
124 |
+
num_labels=3,
|
125 |
+
pad_token_id=1,
|
126 |
+
bos_token_id=0,
|
127 |
+
eos_token_id=2,
|
128 |
+
is_encoder_decoder=True,
|
129 |
+
decoder_start_token_id=2,
|
130 |
+
forced_eos_token_id=2,
|
131 |
+
forced_bos_token_id=0,
|
132 |
+
no_repeat_ngram_size=3, #adding
|
133 |
+
num_hidden_layers=6,
|
134 |
+
normalize_before=False,
|
135 |
+
num_beams=4,
|
136 |
+
add_bias_logits=False,
|
137 |
+
add_final_layer_norm=False,
|
138 |
+
_name_or_path="bart-base",
|
139 |
+
early_stopping=True,
|
140 |
+
gradient_checkpointing=False,
|
141 |
+
num_relation_kinds = 0,
|
142 |
+
use_same_relation_kv_emb = True,
|
143 |
+
is_simple_mask_commonsense = False,
|
144 |
+
should_embed_positions = True,
|
145 |
+
heads_mask = None,
|
146 |
+
**kwargs
|
147 |
+
):
|
148 |
+
super(BartSmallCustomConfig, self).__init__(
|
149 |
+
vocab_size=vocab_size,
|
150 |
+
max_position_embeddings=max_position_embeddings,
|
151 |
+
encoder_layers=encoder_layers,
|
152 |
+
encoder_ffn_dim=encoder_ffn_dim,
|
153 |
+
encoder_attention_heads=encoder_attention_heads,
|
154 |
+
decoder_layers=decoder_layers,
|
155 |
+
decoder_ffn_dim=decoder_ffn_dim,
|
156 |
+
decoder_attention_heads=decoder_attention_heads,
|
157 |
+
encoder_layerdrop=encoder_layerdrop,
|
158 |
+
decoder_layerdrop=decoder_layerdrop,
|
159 |
+
activation_function=activation_function,
|
160 |
+
d_model=d_model,
|
161 |
+
dropout=dropout,
|
162 |
+
attention_dropout=attention_dropout,
|
163 |
+
activation_dropout=activation_dropout,
|
164 |
+
init_std=init_std,
|
165 |
+
classifier_dropout=classifier_dropout,
|
166 |
+
classif_dropout=classif_dropout,
|
167 |
+
scale_embedding=scale_embedding,
|
168 |
+
use_cache=use_cache,
|
169 |
+
num_labels=num_labels,
|
170 |
+
pad_token_id = pad_token_id,
|
171 |
+
bos_token_id = bos_token_id,
|
172 |
+
eos_token_id = eos_token_id,
|
173 |
+
is_encoder_decoder = is_encoder_decoder,
|
174 |
+
decoder_start_token_id = decoder_start_token_id,
|
175 |
+
forced_eos_token_id = forced_eos_token_id,
|
176 |
+
forced_bos_token_id=forced_bos_token_id,
|
177 |
+
no_repeat_ngram_size = no_repeat_ngram_size, #Adding
|
178 |
+
normalize_before = normalize_before,
|
179 |
+
num_hidden_layers=num_hidden_layers,
|
180 |
+
num_beams=num_beams,
|
181 |
+
add_bias_logits=add_bias_logits,
|
182 |
+
add_final_layer_norm=add_final_layer_norm,
|
183 |
+
_name_or_path=_name_or_path,
|
184 |
+
early_stopping=early_stopping,
|
185 |
+
gradient_checkpointing=gradient_checkpointing,
|
186 |
+
num_relation_kinds = num_relation_kinds,
|
187 |
+
use_same_relation_kv_emb = use_same_relation_kv_emb,
|
188 |
+
is_simple_mask_commonsense = is_simple_mask_commonsense,
|
189 |
+
heads_mask = heads_mask,
|
190 |
+
should_embed_positions=should_embed_positions,
|
191 |
+
**kwargs
|
192 |
+
)
|
193 |
+
self.num_relation_kinds = num_relation_kinds
|
194 |
+
self.use_same_relation_kv_emb = use_same_relation_kv_emb
|
195 |
+
self.is_simple_mask_commonsense = is_simple_mask_commonsense
|
196 |
+
self.heads_mask = heads_mask
|
197 |
+
self.should_embed_positions = should_embed_positions
|
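A short, illustrative sketch of what the custom configs add on top of `BartConfig` (values are examples, not taken from the commit):

```python
# Illustrative values only (not from the commit); the custom config is a drop-in
# BartConfig carrying extra knowledge-graph fields.
from custom_bart.config import BartSmallCustomConfig

config = BartSmallCustomConfig(
    num_relation_kinds=40,           # assumed: number of relation types, 0 reserved for "no relation"
    use_same_relation_kv_emb=True,   # share the relation embedding between keys and values
    is_simple_mask_commonsense=False,
    should_embed_positions=True,
)
print(config.d_model, config.encoder_layers, config.num_relation_kinds)
config.save_pretrained("./bart_custom_config")  # serialises the extra fields alongside the usual BartConfig ones
```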
custom_bart/custom_constants.py
ADDED
@@ -0,0 +1,168 @@
1 |
+
|
2 |
+
class BartConstants:
|
3 |
+
CHECKPOINT_FOR_DOC = "facebook/bart-base"
|
4 |
+
CONFIG_FOR_DOC = "BartConfig"
|
5 |
+
TOKENIZER_FOR_DOC = "BartTokenizer"
|
6 |
+
|
7 |
+
# Base model docstring
|
8 |
+
EXPECTED_OUTPUT_SHAPE = [1, 8, 768]
|
9 |
+
|
10 |
+
BART_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
11 |
+
"facebook/bart-large",
|
12 |
+
]
|
13 |
+
|
14 |
+
BART_START_DOCSTRING = r"""
|
15 |
+
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
16 |
+
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
17 |
+
etc.)
|
18 |
+
|
19 |
+
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
20 |
+
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
21 |
+
and behavior.
|
22 |
+
|
23 |
+
Parameters:
|
24 |
+
config ([`BartConfig`]):
|
25 |
+
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
26 |
+
load the weights associated with the model, only the configuration. Check out the
|
27 |
+
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
28 |
+
"""
|
29 |
+
BART_INPUTS_DOCSTRING = r"""
|
30 |
+
Args:
|
31 |
+
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
32 |
+
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
33 |
+
it.
|
34 |
+
|
35 |
+
Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
36 |
+
[`PreTrainedTokenizer.__call__`] for details.
|
37 |
+
|
38 |
+
[What are input IDs?](../glossary#input-ids)
|
39 |
+
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
40 |
+
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
41 |
+
|
42 |
+
- 1 for tokens that are **not masked**,
|
43 |
+
- 0 for tokens that are **masked**.
|
44 |
+
|
45 |
+
[What are attention masks?](../glossary#attention-mask)
|
46 |
+
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
|
47 |
+
Indices of decoder input sequence tokens in the vocabulary.
|
48 |
+
|
49 |
+
Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
50 |
+
[`PreTrainedTokenizer.__call__`] for details.
|
51 |
+
|
52 |
+
[What are decoder input IDs?](../glossary#decoder-input-ids)
|
53 |
+
|
54 |
+
Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
|
55 |
+
is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
|
56 |
+
|
57 |
+
For translation and summarization training, `decoder_input_ids` should be provided. If no
|
58 |
+
`decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
|
59 |
+
for denoising pre-training following the paper.
|
60 |
+
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
|
61 |
+
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
|
62 |
+
be used by default.
|
63 |
+
|
64 |
+
If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_inputs`] and
|
65 |
+
modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information
|
66 |
+
on the default strategy.
|
67 |
+
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
|
68 |
+
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
|
69 |
+
|
70 |
+
- 1 indicates the head is **not masked**,
|
71 |
+
- 0 indicates the head is **masked**.
|
72 |
+
|
73 |
+
decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
|
74 |
+
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
|
75 |
+
|
76 |
+
- 1 indicates the head is **not masked**,
|
77 |
+
- 0 indicates the head is **masked**.
|
78 |
+
|
79 |
+
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
|
80 |
+
Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
|
81 |
+
1]`:
|
82 |
+
|
83 |
+
- 1 indicates the head is **not masked**,
|
84 |
+
- 0 indicates the head is **masked**.
|
85 |
+
|
86 |
+
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
|
87 |
+
Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
|
88 |
+
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
|
89 |
+
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
|
90 |
+
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
91 |
+
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
92 |
+
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
|
93 |
+
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
|
94 |
+
|
95 |
+
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
96 |
+
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
97 |
+
|
98 |
+
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
|
99 |
+
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
|
100 |
+
`decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape
|
101 |
+
`(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you
|
102 |
+
can choose to directly pass an embedded representation. This is useful if you want more control over how to
|
103 |
+
convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
|
104 |
+
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
|
105 |
+
Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
|
106 |
+
representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
|
107 |
+
input (see `past_key_values`). This is useful if you want more control over how to convert
|
108 |
+
`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
|
109 |
+
|
110 |
+
If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
|
111 |
+
of `inputs_embeds`.
|
112 |
+
use_cache (`bool`, *optional*):
|
113 |
+
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
|
114 |
+
`past_key_values`).
|
115 |
+
output_attentions (`bool`, *optional*):
|
116 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
117 |
+
tensors for more detail.
|
118 |
+
output_hidden_states (`bool`, *optional*):
|
119 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
120 |
+
more detail.
|
121 |
+
return_dict (`bool`, *optional*):
|
122 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
123 |
+
"""
|
124 |
+
BART_GENERATION_EXAMPLE = r"""
|
125 |
+
Summarization example:
|
126 |
+
|
127 |
+
```python
|
128 |
+
>>> from transformers import BartTokenizer, BartForConditionalGeneration
|
129 |
+
|
130 |
+
>>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
|
131 |
+
>>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
|
132 |
+
|
133 |
+
>>> ARTICLE_TO_SUMMARIZE = (
|
134 |
+
... "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
|
135 |
+
... "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
|
136 |
+
... "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
|
137 |
+
... )
|
138 |
+
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")
|
139 |
+
|
140 |
+
>>> # Generate Summary
|
141 |
+
>>> summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20)
|
142 |
+
>>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
143 |
+
'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions'
|
144 |
+
```
|
145 |
+
|
146 |
+
Mask filling example:
|
147 |
+
|
148 |
+
```python
|
149 |
+
>>> from transformers import BartTokenizer, BartForConditionalGeneration
|
150 |
+
|
151 |
+
>>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
|
152 |
+
>>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
|
153 |
+
|
154 |
+
>>> TXT = "My friends are <mask> but they eat too many carbs."
|
155 |
+
>>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
|
156 |
+
>>> logits = model(input_ids).logits
|
157 |
+
|
158 |
+
>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
|
159 |
+
>>> probs = logits[0, masked_index].softmax(dim=0)
|
160 |
+
>>> values, predictions = probs.topk(5)
|
161 |
+
|
162 |
+
>>> tokenizer.decode(predictions).split()
|
163 |
+
['not', 'good', 'healthy', 'great', 'very']
|
164 |
+
```
|
165 |
+
"""
|
166 |
+
|
167 |
+
|
168 |
+
|
custom_bart/custom_outputs.py
ADDED
@@ -0,0 +1,142 @@
1 |
+
#############################
|
2 |
+
# Imports
|
3 |
+
#############################
|
4 |
+
|
5 |
+
# Python modules
|
6 |
+
from dataclasses import dataclass
|
7 |
+
from typing import Optional, Tuple
|
8 |
+
|
9 |
+
# Remote modules
|
10 |
+
import torch
|
11 |
+
from transformers.modeling_outputs import ModelOutput
|
12 |
+
|
13 |
+
# Local modules
|
14 |
+
|
15 |
+
#############################
|
16 |
+
# Constants
|
17 |
+
#############################
|
18 |
+
|
19 |
+
#############################
|
20 |
+
# Stuff
|
21 |
+
#############################
|
22 |
+
|
23 |
+
@dataclass
|
24 |
+
class CustomSeq2SeqLMOutput(ModelOutput):
|
25 |
+
"""
|
26 |
+
Base class for sequence-to-sequence language models outputs.
|
27 |
+
|
28 |
+
Args:
|
29 |
+
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
30 |
+
Language modeling loss.
|
31 |
+
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
32 |
+
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
33 |
+
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
34 |
+
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
35 |
+
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
|
36 |
+
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
|
37 |
+
|
38 |
+
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
39 |
+
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
40 |
+
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
41 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
42 |
+
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
43 |
+
|
44 |
+
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
|
45 |
+
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
46 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
47 |
+
sequence_length)`.
|
48 |
+
|
49 |
+
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
|
50 |
+
self-attention heads.
|
51 |
+
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
52 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
53 |
+
sequence_length)`.
|
54 |
+
|
55 |
+
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
|
56 |
+
weighted average in the cross-attention heads.
|
57 |
+
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
58 |
+
Sequence of hidden-states at the output of the last layer of the encoder of the model.
|
59 |
+
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
60 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
61 |
+
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
62 |
+
|
63 |
+
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
|
64 |
+
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
65 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
66 |
+
sequence_length)`.
|
67 |
+
|
68 |
+
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
|
69 |
+
self-attention heads.
|
70 |
+
"""
|
71 |
+
|
72 |
+
loss: Optional[torch.FloatTensor] = None
|
73 |
+
logits: torch.FloatTensor = None
|
74 |
+
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
75 |
+
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
76 |
+
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
77 |
+
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
78 |
+
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
|
79 |
+
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
80 |
+
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
81 |
+
head_mask: Optional[Tuple[torch.FloatTensor]] = None
|
82 |
+
|
83 |
+
@dataclass
|
84 |
+
class CustomSeq2SeqModelOutput(ModelOutput):
|
85 |
+
"""
|
86 |
+
Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
|
87 |
+
decoding.
|
88 |
+
|
89 |
+
Args:
|
90 |
+
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
91 |
+
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
92 |
+
|
93 |
+
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
|
94 |
+
hidden_size)` is output.
|
95 |
+
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
96 |
+
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
97 |
+
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
|
98 |
+
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
|
99 |
+
|
100 |
+
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
101 |
+
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
102 |
+
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
103 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
104 |
+
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
105 |
+
|
106 |
+
Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
|
107 |
+
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
108 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
109 |
+
sequence_length)`.
|
110 |
+
|
111 |
+
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
|
112 |
+
self-attention heads.
|
113 |
+
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
114 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
115 |
+
sequence_length)`.
|
116 |
+
|
117 |
+
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
|
118 |
+
weighted average in the cross-attention heads.
|
119 |
+
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
120 |
+
Sequence of hidden-states at the output of the last layer of the encoder of the model.
|
121 |
+
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
122 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
123 |
+
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
124 |
+
|
125 |
+
Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
|
126 |
+
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
127 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
128 |
+
sequence_length)`.
|
129 |
+
|
130 |
+
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
|
131 |
+
self-attention heads.
|
132 |
+
"""
|
133 |
+
|
134 |
+
last_hidden_state: torch.FloatTensor = None
|
135 |
+
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
136 |
+
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
137 |
+
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
138 |
+
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
139 |
+
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
|
140 |
+
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
141 |
+
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
142 |
+
encoder_head_mask: Optional[Tuple[torch.FloatTensor]] = None
|
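A small sketch (not in the commit) showing that the custom output classes behave like any `ModelOutput`, with the extra head-mask field riding along; the placeholder tensors reuse the `[1, 8, 768]` shape from `EXPECTED_OUTPUT_SHAPE` in `custom_constants.py`.

```python
# Sketch (not in the commit): the custom outputs support attribute, key, and index access.
import torch

from custom_bart.custom_outputs import CustomSeq2SeqModelOutput

out = CustomSeq2SeqModelOutput(
    last_hidden_state=torch.zeros(1, 8, 768),
    encoder_last_hidden_state=torch.zeros(1, 8, 768),
    encoder_head_mask=None,  # the extra field added on top of the stock Seq2SeqModelOutput
)
assert out["last_hidden_state"] is out.last_hidden_state  # dict-style access
assert out[0].shape == (1, 8, 768)                        # tuple-style access skips None fields
```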
custom_bart/decoder.py
ADDED
@@ -0,0 +1,312 @@
1 |
+
#############################
|
2 |
+
# Imports
|
3 |
+
#############################
|
4 |
+
|
5 |
+
# Python modules
|
6 |
+
from typing import (
|
7 |
+
Optional,
|
8 |
+
Tuple,
|
9 |
+
Union,
|
10 |
+
List,
|
11 |
+
)
|
12 |
+
import math
|
13 |
+
import random
|
14 |
+
|
15 |
+
# Remote modules
|
16 |
+
import torch
|
17 |
+
from torch import nn
|
18 |
+
from transformers import (
|
19 |
+
BartConfig,
|
20 |
+
BartPretrainedModel,
|
21 |
+
)
|
22 |
+
from transformers.modeling_outputs import (
|
23 |
+
BaseModelOutput,
|
24 |
+
BaseModelOutputWithPastAndCrossAttentions
|
25 |
+
)
|
26 |
+
from transformers.models.bart.modeling_bart import (
|
27 |
+
BartLearnedPositionalEmbedding,
|
28 |
+
_expand_mask,
|
29 |
+
_make_causal_mask
|
30 |
+
)
|
31 |
+
from transformers.utils import (
|
32 |
+
logging,
|
33 |
+
)
|
34 |
+
|
35 |
+
# Local modules
|
36 |
+
from .config import BartCustomConfig
|
37 |
+
from .decoder_layer import BartCustomDecoderLayer
|
38 |
+
|
39 |
+
logger = logging.get_logger(__name__)
|
40 |
+
|
41 |
+
class BartCustomDecoder(BartPretrainedModel):
|
42 |
+
"""
|
43 |
+
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BartDecoderLayer`]
|
44 |
+
|
45 |
+
Args:
|
46 |
+
config: BartConfig
|
47 |
+
embed_tokens (nn.Embedding): output embedding
|
48 |
+
"""
|
49 |
+
|
50 |
+
def __init__(self, config: BartCustomConfig, embed_tokens: Optional[nn.Embedding] = None):
|
51 |
+
super().__init__(config)
|
52 |
+
self.dropout = config.dropout
|
53 |
+
self.layerdrop = config.decoder_layerdrop
|
54 |
+
self.padding_idx = config.pad_token_id
|
55 |
+
self.max_target_positions = config.max_position_embeddings
|
56 |
+
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
|
57 |
+
|
58 |
+
if embed_tokens is not None:
|
59 |
+
self.embed_tokens = embed_tokens
|
60 |
+
else:
|
61 |
+
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
|
62 |
+
|
63 |
+
self.embed_positions = BartLearnedPositionalEmbedding(
|
64 |
+
config.max_position_embeddings,
|
65 |
+
config.d_model,
|
66 |
+
)
|
67 |
+
self.layers = nn.ModuleList([BartCustomDecoderLayer(config) for _ in range(config.decoder_layers)])
|
68 |
+
self.layernorm_embedding = nn.LayerNorm(config.d_model)
|
69 |
+
|
70 |
+
self.gradient_checkpointing = False
|
71 |
+
# Initialize weights and apply final processing
|
72 |
+
self.post_init()
|
73 |
+
|
74 |
+
def get_input_embeddings(self):
|
75 |
+
return self.embed_tokens
|
76 |
+
|
77 |
+
def set_input_embeddings(self, value):
|
78 |
+
self.embed_tokens = value
|
79 |
+
|
80 |
+
def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
|
81 |
+
# create causal mask
|
82 |
+
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
83 |
+
combined_attention_mask = None
|
84 |
+
if input_shape[-1] > 1:
|
85 |
+
combined_attention_mask = _make_causal_mask(
|
86 |
+
input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
|
87 |
+
).to(self.device)
|
88 |
+
|
89 |
+
if attention_mask is not None:
|
90 |
+
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
91 |
+
expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
|
92 |
+
combined_attention_mask = (
|
93 |
+
expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
|
94 |
+
)
|
95 |
+
|
96 |
+
return combined_attention_mask
|
97 |
+
|
98 |
+
def forward(
|
99 |
+
self,
|
100 |
+
input_ids: torch.LongTensor = None,
|
101 |
+
attention_mask: Optional[torch.Tensor] = None,
|
102 |
+
encoder_hidden_states: Optional[torch.FloatTensor] = None,
|
103 |
+
encoder_attention_mask: Optional[torch.LongTensor] = None,
|
104 |
+
head_mask: Optional[torch.Tensor] = None,
|
105 |
+
cross_attn_head_mask: Optional[torch.Tensor] = None,
|
106 |
+
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
107 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
108 |
+
use_cache: Optional[bool] = None,
|
109 |
+
output_attentions: Optional[bool] = None,
|
110 |
+
output_hidden_states: Optional[bool] = None,
|
111 |
+
return_dict: Optional[bool] = None,
|
112 |
+
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
|
113 |
+
r"""
|
114 |
+
Args:
|
115 |
+
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
116 |
+
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
|
117 |
+
provide it.
|
118 |
+
|
119 |
+
Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
120 |
+
[`PreTrainedTokenizer.__call__`] for details.
|
121 |
+
|
122 |
+
[What are input IDs?](../glossary#input-ids)
|
123 |
+
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
124 |
+
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
125 |
+
|
126 |
+
- 1 for tokens that are **not masked**,
|
127 |
+
- 0 for tokens that are **masked**.
|
128 |
+
|
129 |
+
[What are attention masks?](../glossary#attention-mask)
|
130 |
+
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
131 |
+
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
|
132 |
+
of the decoder.
|
133 |
+
encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
|
134 |
+
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
|
135 |
+
selected in `[0, 1]`:
|
136 |
+
|
137 |
+
- 1 for tokens that are **not masked**,
|
138 |
+
- 0 for tokens that are **masked**.
|
139 |
+
|
140 |
+
[What are attention masks?](../glossary#attention-mask)
|
141 |
+
head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
|
142 |
+
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
|
143 |
+
|
144 |
+
- 1 indicates the head is **not masked**,
|
145 |
+
- 0 indicates the head is **masked**.
|
146 |
+
|
147 |
+
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
|
148 |
+
Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
|
149 |
+
cross-attention on hidden heads. Mask values selected in `[0, 1]`:
|
150 |
+
|
151 |
+
- 1 indicates the head is **not masked**,
|
152 |
+
- 0 indicates the head is **masked**.
|
153 |
+
|
154 |
+
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
155 |
+
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
|
156 |
+
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
|
157 |
+
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
|
158 |
+
|
159 |
+
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
|
160 |
+
cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
161 |
+
|
162 |
+
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
|
163 |
+
that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
|
164 |
+
all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of
|
165 |
+
shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing
|
166 |
+
`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more
|
167 |
+
control over how to convert `input_ids` indices into associated vectors than the model's internal
|
168 |
+
embedding lookup matrix.
|
169 |
+
output_attentions (`bool`, *optional*):
|
170 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
171 |
+
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask, input_shape, inputs_embeds, past_key_values_length
        )

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])

        # embed positions
        positions = self.embed_positions(input_shape, past_key_values_length)

        hidden_states = inputs_embeds + positions
        hidden_states = self.layernorm_embedding(hidden_states)

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
        next_decoder_cache = () if use_cache else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != (len(self.layers)):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
                    )

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):
                continue

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:

                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # None for past_key_value
                        return module(*inputs, output_attentions, use_cache)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer),
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    head_mask[idx] if head_mask is not None else None,
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
                    None,
                )
            else:

                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    cross_attn_layer_head_mask=(
                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                    ),
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )
            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )
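Note (illustration, not part of the commit): when `return_dict=False` the decoder drops every `None` entry from the tuple above, so the position of each field depends on which flags were enabled. A minimal, self-contained sketch of that filtering behavior, using placeholder strings instead of tensors:

# Illustrative only: mimics `tuple(v for v in [...] if v is not None)` above.
candidates = ["last_hidden_state", None, "all_hidden_states", None, None]  # e.g. use_cache and attentions disabled
returned = tuple(v for v in candidates if v is not None)
print(returned)  # ('last_hidden_state', 'all_hidden_states')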
custom_bart/decoder_layer.py
ADDED
@@ -0,0 +1,134 @@
#############################
# Imports
#############################

# Python modules
from typing import Optional, Tuple

# Remote modules
import torch
from torch import nn
from transformers import BartConfig
from transformers.activations import ACT2FN

# Local modules
from transformers.models.bart.modeling_bart import BartAttention

from .config import BartCustomConfig


class BartCustomDecoderLayer(nn.Module):
    def __init__(self, config: BartCustomConfig):
        super().__init__()
        self.embed_dim = config.d_model

        self.self_attn = BartAttention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.encoder_attn = BartAttention(
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        # Self Attention
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # add present self-attn cache to positions 1,2 of present_key_value tuple
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block
        cross_attn_present_key_value = None
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states

            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_past_key_value,
                output_attentions=output_attentions,
            )
            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)

            # add cross-attn to positions 3,4 of present_key_value tuple
            present_key_value = present_key_value + cross_attn_present_key_value

        # Fully Connected
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        if use_cache:
            outputs += (present_key_value,)

        return outputs
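Note (illustration, not part of the commit): the layer returns its outputs positionally, which is why the decoder above reads the key/value cache at index 3 when attentions are returned and at index 1 otherwise. A minimal sketch of that index arithmetic with placeholder values:

# With output_attentions=True and use_cache=True the layer yields
# (hidden_states, self_attn_weights, cross_attn_weights, present_key_value);
# without attentions it yields (hidden_states, present_key_value).
layer_outputs_with_attn = ("h", "self_attn", "cross_attn", "kv_cache")
layer_outputs_no_attn = ("h", "kv_cache")
for output_attentions, outs in [(True, layer_outputs_with_attn), (False, layer_outputs_no_attn)]:
    cache = outs[3 if output_attentions else 1]
    assert cache == "kv_cache"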
custom_bart/encoder.py
ADDED
@@ -0,0 +1,216 @@
#############################
# Imports
#############################

# Python modules
from typing import (
    Optional,
    Tuple,
    Union,
)
import math
import random

# Remote modules
import torch
from torch import nn
from transformers import (
    BartConfig,
    BartPretrainedModel,
)
from transformers.modeling_outputs import BaseModelOutput
from transformers.models.bart.modeling_bart import (
    BartLearnedPositionalEmbedding,
    _expand_mask
)

# Local modules
from .config import BartCustomConfig
from .encoder_layer import BartCustomEncoderLayer


class BartCustomEncoder(BartPretrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`BartEncoderLayer`].

    Args:
        config: BartConfig
        embed_tokens (nn.Embedding): output embedding
    """

    def __init__(self, config: BartCustomConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)

        if not config.should_embed_positions:
            self.embed_positions = None
        else:
            self.embed_positions = BartLearnedPositionalEmbedding(
                config.max_position_embeddings,
                embed_dim,
            )
        device = self.device
        self.layers = nn.ModuleList([BartCustomEncoderLayer(config, heads_mask=torch.Tensor(config.heads_mask[i]).to(device))
                                     for i in range(config.encoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(embed_dim)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()
        self.run_config = config

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        relation_inputs: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        # Important for datasets where the order of the words does not matter (e.g. commongen)
        if self.run_config.should_embed_positions:
            embed_pos = self.embed_positions(input_shape)
            hidden_states = inputs_embeds + embed_pos
        else:
            hidden_states = inputs_embeds

        hidden_states = self.layernorm_embedding(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        # expand attention_mask
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            if head_mask.size()[0] != (len(self.layers)):
                raise ValueError(
                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
                )

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                layer_outputs = (None, None)
            else:
                if self.gradient_checkpointing and self.training:

                    def create_custom_forward(module):
                        def custom_forward(*inputs):
                            return module(*inputs, output_attentions, relation_inputs=relation_inputs)

                        return custom_forward

                    layer_outputs = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(encoder_layer),
                        hidden_states,
                        attention_mask,
                        (head_mask[idx] if head_mask is not None else None),
                    )
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        attention_mask,
                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                        output_attentions=output_attentions,
                        relation_inputs=relation_inputs,
                    )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )
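Note (illustration, not part of the commit): the encoder builds each layer from one row of `config.heads_mask`, so that field is expected to hold one per-head mask per encoder layer. A minimal sketch of building such a nested list; the layer/head counts and the "only head 0 is selected" choice are illustrative assumptions, not values from the commit:

# Illustrative only: one mask row per encoder layer, one entry per attention head.
import torch

encoder_layers, encoder_attention_heads = 12, 16
heads_mask = [[1.0 if head == 0 else 0.0 for head in range(encoder_attention_heads)]
              for _ in range(encoder_layers)]
print(torch.Tensor(heads_mask[0]).shape)  # torch.Size([16]) -- what each BartCustomEncoderLayer receives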
custom_bart/encoder_layer.py
ADDED
@@ -0,0 +1,102 @@
#############################
# Imports
#############################

# Python modules
from typing import Optional, Tuple

# Remote modules
import torch
from torch import nn
from transformers import BartConfig
from transformers.activations import ACT2FN

# Local modules
from .bart_attention import BartCustomAttention
from .bart_mask_attention import BartCustomMaskAttention
from .config import BartCustomConfig


class BartCustomEncoderLayer(nn.Module):
    def __init__(self, config: BartCustomConfig, heads_mask: Optional[torch.Tensor]):
        super().__init__()
        self.embed_dim = config.d_model
        is_simple_mask_commonsense = config.is_simple_mask_commonsense
        if not is_simple_mask_commonsense:
            print("Selecting complex relation attention")
            self.self_attn = BartCustomAttention(
                embed_dim=self.embed_dim,
                num_heads=config.encoder_attention_heads,
                dropout=config.attention_dropout,
                num_relation_kinds=config.num_relation_kinds,
                use_same_relation_kv_emb=config.use_same_relation_kv_emb,
                heads_mask=heads_mask,
            )
        else:
            print("Selecting simple (MASK) relation attention")
            self.self_attn = BartCustomMaskAttention(
                embed_dim=self.embed_dim,
                num_heads=config.encoder_attention_heads,
                dropout=config.attention_dropout,
                num_relation_kinds=config.num_relation_kinds,
                heads_mask=heads_mask,
            )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: torch.FloatTensor,
        layer_head_mask: torch.FloatTensor,
        output_attentions: Optional[bool] = False,
        relation_inputs: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states, attn_weights, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            relation_inputs=relation_inputs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        if hidden_states.dtype == torch.float16 and (
            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
        ):
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs
custom_tokenizer/__init__.py
ADDED
@@ -0,0 +1 @@
from .bart_custom_tokenizer_fast import *
custom_tokenizer/bart_custom_tokenizer_fast.py
ADDED
@@ -0,0 +1,484 @@
# coding=utf-8
# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from typing import List, Optional, Tuple, Dict
from collections import deque

import torch
import numpy as np

from tokenizers import pre_tokenizers, processors

from transformers.tokenization_utils_base import AddedToken, BatchEncoding
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers.utils import logging
from transformers.models.bart.tokenization_bart import BartTokenizer


logger = logging.get_logger(__name__)


VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}

# See all BART models at https://huggingface.co/models?filter=bart
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/vocab.json",
        "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json",
        "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/vocab.json",
        "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/vocab.json",
        "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/vocab.json",
        "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/vocab.json",
    },
    "merges_file": {
        "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/merges.txt",
        "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt",
        "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/merges.txt",
        "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/merges.txt",
        "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/merges.txt",
        "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/merges.txt",
    },
    "tokenizer_file": {
        "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/tokenizer.json",
        "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/tokenizer.json",
        "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/tokenizer.json",
        "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/tokenizer.json",
        "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/tokenizer.json",
        "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/tokenizer.json",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/bart-base": 1024,
    "facebook/bart-large": 1024,
    "facebook/bart-large-mnli": 1024,
    "facebook/bart-large-cnn": 1024,
    "facebook/bart-large-xsum": 1024,
    "yjernite/bart_eli5": 1024,
}


class BartCustomTokenizerFast(PreTrainedTokenizerFast):
    r"""
    Construct a "fast" BART tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2 tokenizer,
    using byte-level Byte-Pair-Encoding.

    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ```
    >>> from transformers import BartTokenizerFast
    >>> tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")
    >>> tokenizer("Hello world")['input_ids']
    [0, 31414, 232, 2]
    >>> tokenizer(" Hello world")['input_ids']
    [0, 20920, 232, 2]
    ```

    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.

    <Tip>

    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.

    </Tip>

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (The BART tokenizer detects the beginning of words by the preceding space.)
        trim_offsets (`bool`, *optional*, defaults to `True`):
            Whether the post processing step should trim offsets to avoid including whitespaces.
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask", "input_commonsense_relations", "commonsense_mask"]
    slow_tokenizer_class = BartTokenizer

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        errors="replace",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        add_prefix_space=False,
        trim_offsets=True,
        **kwargs
    ):
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            add_prefix_space=add_prefix_space,
            trim_offsets=trim_offsets,
            **kwargs,
        )

        self.relational_kind_to_index = None
        self.there_is_difference_between_relations = True

        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)

        self.add_prefix_space = add_prefix_space

        # the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__`
        tokenizer_component = "post_processor"
        tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
        if tokenizer_component_instance:
            state = json.loads(tokenizer_component_instance.__getstate__())

            # The lists 'sep' and 'cls' must be cast to tuples for the object `post_processor_class`
            if "sep" in state:
                state["sep"] = tuple(state["sep"])
            if "cls" in state:
                state["cls"] = tuple(state["cls"])

            changes_to_apply = False

            if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
                state["add_prefix_space"] = add_prefix_space
                changes_to_apply = True

            if state.get("trim_offsets", trim_offsets) != trim_offsets:
                state["trim_offsets"] = trim_offsets
                changes_to_apply = True

            if changes_to_apply:
                component_class = getattr(processors, state.pop("type"))
                new_value = component_class(**state)
                setattr(self.backend_tokenizer, tokenizer_component, new_value)

    def __call__(self, *args, **kwargs):
        input_commonsense_relations = kwargs.get('input_commonsense_relations', None)
        if 'input_commonsense_relations' in kwargs:
            kwargs.pop('input_commonsense_relations')
        out = super(BartCustomTokenizerFast, self).__call__(*args, **kwargs)
        if out.get('input_commonsense_relations') is None:
            out = self._post_process_tokenization(input_commonsense_relations, out)
        return out

    def set_known_relation_names(self, known_relations_names: List[str]):
        self.relational_kind_to_index = {t: i + 1 for i, t in enumerate(known_relations_names)}

    def set_operation_mode(self, there_is_difference_between_relations=True):
        self.there_is_difference_between_relations = there_is_difference_between_relations

    @property
    def mask_token(self) -> str:
        """
        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
        having been set.

        BART tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
        comprise the space before the *<mask>*.
        """
        if self._mask_token is None and self.verbose:
            logger.error("Using mask_token, but it is not set yet.")
            return None
        return str(self._mask_token)

    @mask_token.setter
    def mask_token(self, value):
        """
        Overriding the default behavior of the mask token to have it eat the space before it.

        This is needed to preserve backward compatibility with all the previously used models based on Bart.
        """
        # The mask token behaves like a normal word, i.e. includes the space before it,
        # so we set lstrip to True
        value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
        self._mask_token = value

    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        is_split_into_words = kwargs.get("is_split_into_words", False)

        if is_split_into_words and not self.add_prefix_space:
            raise ValueError(
                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
                "to use it with pretokenized inputs."
            )
        input_commonsense_relations = kwargs.get('input_commonsense_relations', None)
        if 'input_commonsense_relations' in kwargs:
            kwargs.pop('input_commonsense_relations')
        out = super()._batch_encode_plus(*args, **kwargs)
        if out.get('input_commonsense_relations') is None:
            out = self._post_process_tokenization(input_commonsense_relations, out)
        return out

    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        is_split_into_words = kwargs.get("is_split_into_words", False)

        if is_split_into_words and not self.add_prefix_space:
            raise ValueError(
                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
                "to use it with pretokenized inputs."
            )

        input_commonsense_relations = kwargs.get('input_commonsense_relations', None)
        if 'input_commonsense_relations' in kwargs:
            kwargs.pop('input_commonsense_relations')
        out = super()._encode_plus(*args, **kwargs)
        if out.get('input_commonsense_relations') is None:
            out = self._post_process_tokenization(input_commonsense_relations, out)
        return out

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
        if token_ids_1 is None:
            return output

        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. BART does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def _post_process_tokenization(self, input_commonsense_relations, out: BatchEncoding) -> BatchEncoding:
        new_input_relations = self.get_new_input_relation_kinds(
            tokenizer_outputs=out, input_relations=input_commonsense_relations
        )
        out['input_commonsense_relations'] = new_input_relations
        return out

    def find_new_tokens_span_for_multiword(self, pair, aux_dict):
        old_start, old_end = pair
        keys = list(aux_dict.keys())
        new_start, new_end = old_start, old_end
        for (start, end) in keys:
            if old_start >= start and old_end <= end:
                new_start, new_end = start, end
                break
        return new_start, new_end

    def find_new_tokens_incoming_span_for_multiword(self, pair, aux_dict):
        old_start, old_end = pair
        incoming_rels = list([coord for v in aux_dict.values() for coord, relation in v.items()])
        new_start, new_end = old_start, old_end
        for (start, end) in incoming_rels:
            if old_start >= start and old_end <= end:
                new_start, new_end = start, end
                break
        return new_start, new_end

    def get_new_input_relation_kinds(
        self,
        tokenizer_outputs: BatchEncoding,
        input_relations: Optional[List[Dict[Tuple[int, int], Dict[Tuple[int, int], str]]]] = None
    ) -> torch.Tensor:

        n_examples = len(tokenizer_outputs['input_ids'])
        n_tokens = len(tokenizer_outputs['input_ids'][0])
        aux_input_relation_kinds = np.zeros(
            (n_examples, n_tokens, n_tokens),
            dtype=np.int64
        )
        if not input_relations and input_relations is not None:
            return torch.from_numpy(aux_input_relation_kinds)
        elif not input_relations:
            return None
        assert 'offset_mapping' in tokenizer_outputs, "Run tokenizer with return_offsets_mapping=True"
        if input_relations is not None:
            # if input_relations is dirty, clean it
            if isinstance(input_relations, dict):
                input_relations = [input_relations]
            mappings = tokenizer_outputs['offset_mapping']
            assert len(mappings) == len(input_relations)
            mappings = [[tuple(x) for x in mappings[idx].cpu().detach().tolist()] for idx in range(n_examples)]
            examples_mappings = []
            max_idx = 0
            for idx, mapping in enumerate(mappings):
                words = tokenizer_outputs.word_ids(batch_index=idx)
                tokens_to_words = deque(words)
                token_idx_2_word_span = {}
                for token_idx, (_char_i, _char_j) in enumerate(mapping):
                    word_idx_of_token = tokens_to_words.popleft()
                    if word_idx_of_token is None:
                        continue
                    token_span = tokenizer_outputs.word_to_chars(word_idx_of_token)
                    token_idx_2_word_span[token_idx] = (token_span.start, token_span.end)  # unsure whether the -1 is still needed here (it used to be -1)
                    max_idx = max(token_idx, max_idx)
                ##### Multiword ######
                token_idx_2_word_span_multiword = {}
                d = input_relations[idx]
                for k, v in token_idx_2_word_span.items():
                    new_start, new_end = self.find_new_tokens_span_for_multiword(v, d)
                    token_idx_2_word_span_multiword[k] = (new_start, new_end)
                    if v[0] == new_start and v[1] == new_end:
                        new_start, new_end = self.find_new_tokens_incoming_span_for_multiword(v, d)
                        token_idx_2_word_span_multiword[k] = (new_start, new_end)
                ##### ######
                examples_mappings.append(token_idx_2_word_span_multiword)
            for i_example in range(n_examples):
                token_idx_2_word_span = examples_mappings[i_example]
                possible_relations = input_relations[i_example]
                for token_i_idx in range(max_idx + 1):
                    for token_j_idx in range(max_idx + 1):
                        fixed_word_range = token_idx_2_word_span.get(token_i_idx, None)
                        other_word_range = token_idx_2_word_span.get(token_j_idx, None)
                        if not fixed_word_range or not other_word_range:
                            continue
                        relations = possible_relations.get(fixed_word_range, None)
                        if not relations:
                            continue
                        relation_kind = relations.get(other_word_range, None)
                        if not relation_kind:
                            continue
                        if self.there_is_difference_between_relations:
                            aux_input_relation_kinds[i_example, token_i_idx, token_j_idx] = self.relational_kind_to_index[relation_kind]
                        else:
                            # basic relation | only matters that a relation exists between tokens
                            aux_input_relation_kinds[i_example, token_i_idx, token_j_idx] = 1
        aux_input_relation_kinds = torch.from_numpy(aux_input_relation_kinds)
        return aux_input_relation_kinds

    def create_commonsense_mask(self, tokenizer_outputs, commonsense_matrix, num_heads=16, specific_head=0):
        bsz = len(tokenizer_outputs['input_ids'])
        n_tokens = len(tokenizer_outputs['input_ids'][0])
        commonsense_mask = np.zeros(
            ((bsz, num_heads, n_tokens, n_tokens)),
            dtype=np.int64
        )
        if commonsense_matrix is None:
            commonsense_matrix = np.zeros(
                ((bsz, n_tokens, n_tokens)),
                dtype=np.int64
            )
        commonsense_mask = commonsense_mask.reshape((num_heads, bsz, n_tokens, n_tokens))
        # commonsense_matrix.shape: (bsz, src_len, tgt_len)
        commonsense_mask[specific_head] = commonsense_matrix
        commonsense_mask = commonsense_mask.reshape((bsz, num_heads, n_tokens, n_tokens))
        return commonsense_mask
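Note (illustration, not part of the commit): `get_new_input_relation_kinds` expects the relations to be keyed by character spans of the source text and requires offset mappings, so a call would look roughly like the sketch below. The model path reuse of `facebook/bart-large` and the relation label `"related_to"` are placeholder assumptions, not values taken from this repository.

# Illustrative only: feeding character-span relations to the custom tokenizer.
from custom_tokenizer import BartCustomTokenizerFast

tokenizer = BartCustomTokenizerFast.from_pretrained("facebook/bart-large")   # placeholder checkpoint
tokenizer.set_known_relation_names(["related_to"])                            # "related_to" is mapped to index 1
tokenizer.set_operation_mode(there_is_difference_between_relations=True)

text = "boat water bird"
relations = {(0, 4): {(5, 10): "related_to"}}                                  # "boat" -> "water", character spans
enc = tokenizer(text, return_tensors="pt", return_offsets_mapping=True,
                input_commonsense_relations=relations)
print(enc["input_commonsense_relations"].shape)                               # (1, seq_len, seq_len) relation-kind matrix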
data/__init__.py
ADDED
File without changes
data/relation_utils.py
ADDED
@@ -0,0 +1,53 @@

#############################
# Imports
#############################

# Python modules
from collections import deque
from ast import literal_eval

# Remote modules
import torch

# Local modules

#############################
# Constants
#############################

##########################################################
# Helper functions for Relations in dict format
##########################################################

def clean_relations(word_relations):
    new_relations = deque()
    for r in word_relations:
        rel = {}
        for r_key, r_value in r.items():
            normal_k = literal_eval(r_key)
            rel_d = {}
            for r_d_key, r_d_value in r_value.items():
                normal_d_k = literal_eval(r_d_key)
                rel_d[normal_d_k] = r_d_value
            rel[normal_k] = rel_d
        new_relations.append(rel)
    list_new_relations = list(new_relations)
    return list_new_relations

##########################################################
# Helper functions for Relations in Matrix format
##########################################################

def relation_binary_2d_to_1d(relations_binary_mask, dim=1):
    relations_binary_mask = relations_binary_mask.sum(dim=dim)
    relations_binary_mask[relations_binary_mask > 1] = 1
    return relations_binary_mask

def tokens_with_relations(relations_binary_mask):
    relations_binary_mask_dim1 = relations_binary_mask.sum(dim=0)
    relations_binary_mask_dim2 = relations_binary_mask.sum(dim=1)
    tokens_with_rels = relations_binary_mask_dim1 + relations_binary_mask_dim2
    tokens_with_rels[tokens_with_rels > 1] = 1
    mask_rels = torch.tensor(tokens_with_rels, dtype=torch.bool)
    return mask_rels
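Note (illustration, not part of the commit): `clean_relations` turns the JSON-friendly string keys such as "(0, 4)" back into tuple keys via `literal_eval`. A minimal, self-contained sketch of the same transformation on one example (the relation label is a placeholder):

# Illustrative only: mirrors what clean_relations does to each dict.
from ast import literal_eval

raw = [{"(0, 4)": {"(5, 10)": "related_to"}}]
cleaned = [{literal_eval(k): {literal_eval(k2): v2 for k2, v2 in v.items()} for k, v in r.items()}
           for r in raw]
print(cleaned)  # [{(0, 4): {(5, 10): 'related_to'}}]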
inference.py
ADDED
@@ -0,0 +1,349 @@
#############################
# Imports
#############################

# Python modules
from typing import List

# Remote modules
import numpy as np
import torch

# Local modules
from kgs_binding.relation_mapper_builder import RelationsMapperBuilder
from kgs_binding.kg_qa_binding_utils import load_kg_handler
from data.relation_utils import clean_relations
from model_utils import create_layers_head_mask

from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
    BartConfig,
    DisjunctiveConstraint,
)

from utils import get_jump_chunks

#############################
# Constants
#############################

#############################
# Stuff
#############################
from custom_tokenizer import BartCustomTokenizerFast
from custom_bart import BartCustomConfig, BartCustomForConditionalGeneration
from utils import get_device, KGType, Model_Type

from kgs_binding.kg_base_wrapper import KGBaseHandler
from kgs_binding.swow_handler import SwowHandler
from kgs_binding.conceptnet_handler import ConceptNetHandler

class Inference:
    def __init__(self, model_path: str, max_length=32):
        self.device = get_device()
        self.tokenizer = self.prepare_tokenizer()
        self.model = self.prepare_model(model_path)
        self.max_length = max_length

    def prepare_tokenizer(self):
        tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
        return tokenizer

    def prepare_model(self, model_path):
        config = BartConfig.from_pretrained(model_path)
        model = BartForConditionalGeneration.from_pretrained(model_path, config=config).to(self.device)
        model.eval()
        return model

    def pre_process_context(self, context):
        context = context.lower()
        context_tokenized = self.tokenizer(context, padding='max_length',
                                           truncation='longest_first', max_length=self.max_length,
                                           return_tensors="pt",
                                           )
        return context_tokenized

    def generate_based_on_context(self, context):
        model_input = self.pre_process_context(context)
        generated_answers_encoded = self.model.generate(input_ids=model_input["input_ids"].to(self.device),
                                                        attention_mask=model_input["attention_mask"].to(self.device),
                                                        min_length=1,
                                                        max_length=self.max_length,
                                                        do_sample=True,
                                                        early_stopping=True,
                                                        num_beams=4,
                                                        temperature=1.0,
                                                        top_k=None,
                                                        top_p=None,
                                                        # eos_token_id=tokenizer.eos_token_id,
                                                        no_repeat_ngram_size=2,
                                                        num_return_sequences=1,
                                                        return_dict_in_generate=True,
                                                        output_attentions=True,
                                                        output_scores=True)
        # print(f'Scores: {generated_answers_encoded}')
        response = self.tokenizer.batch_decode(generated_answers_encoded['sequences'], skip_special_tokens=True,
                                               clean_up_tokenization_spaces=True)
        encoder_attentions = generated_answers_encoded['encoder_attentions']
        return response, encoder_attentions, model_input

    def prepare_context_for_visualization(self, context):
        examples = []
        response, encoder_outputs, model_input = self.generate_based_on_context(context)
        encoder_outputs = torch.stack(encoder_outputs)
        n_layers, batch_size, n_heads, src, tgt = encoder_outputs.size()
        print(encoder_outputs.size())
        encoder_attentions = encoder_outputs.view(batch_size, n_layers, n_heads, src, tgt)
        for i, ex in enumerate(encoder_attentions):
            d = {}
            indices = model_input['input_ids'][i].detach().cpu()
            all_tokens = self.tokenizer.convert_ids_to_tokens(indices)
            useful_indeces = indices != self.tokenizer.pad_token_id
            all_tokens = np.array(all_tokens)[useful_indeces]
            all_tokens = [tok.replace('Ġ', '') for tok in all_tokens]
            d['words'] = all_tokens
            d['attentions'] = ex.detach().cpu().numpy()
            examples.append(d)
            print(d['words'])
        return response, examples

class RelationsInference:
    def __init__(self, model_path: str, kg_type: KGType, model_type: Model_Type, max_length=32):
        self.device = get_device()
        kg_handler: KGBaseHandler = load_kg_handler(kg_type)
        self.kg_handler = kg_handler
        relation_names = kg_handler.get_relation_types()
        self.tokenizer = self.prepare_tokenizer(relation_names, model_type)
        self.simple_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
        self.model, self.config = self.prepare_model(relation_names, model_path, model_type)
        self.relation_mapper_builder = RelationsMapperBuilder(knowledge=kg_handler)
        self.max_length = max_length

    def prepare_tokenizer(self, relation_names: List[str], model_type: Model_Type):
|
124 |
+
tokenizer = BartCustomTokenizerFast.from_pretrained('facebook/bart-large')
|
125 |
+
tokenizer.set_known_relation_names(relation_names)
|
126 |
+
tokenizer.set_operation_mode(there_is_difference_between_relations=model_type.there_is_difference_between_relations())
|
127 |
+
return tokenizer
|
128 |
+
|
129 |
+
def prepare_model(self, relation_names: List[str], model_path, model_type:Model_Type):
|
130 |
+
config = BartCustomConfig.from_pretrained(model_path, revision='master')
|
131 |
+
print('config.heads_mask:', config.heads_mask)
|
132 |
+
if config.num_relation_kinds is None:
|
133 |
+
config.num_relation_kinds = len(relation_names)
|
134 |
+
if config.is_simple_mask_commonsense is None:
|
135 |
+
config.is_simple_mask_commonsense = model_type.is_simple_mask_commonsense()
|
136 |
+
if config.heads_mask is None:
|
137 |
+
config.heads_mask = create_layers_head_mask(config)#, heads_mask_type, specific_heads)
|
138 |
+
model = BartCustomForConditionalGeneration.from_pretrained(model_path, config=config, revision='master').to(self.device)
|
139 |
+
model.eval()
|
140 |
+
return model, config
|
141 |
+
|
142 |
+
def pre_process_context(self, context):
|
143 |
+
context = context.lower()
|
144 |
+
# process context in search for relations
|
145 |
+
commonsense_relations = self.relation_mapper_builder.get_relations_mapping_complex(context=[context], clear_common_wds=True)
|
146 |
+
# clean relation
|
147 |
+
commonsense_relation = clean_relations(commonsense_relations)[0]
|
148 |
+
# convert this relations to matrices
|
149 |
+
print(commonsense_relation)
|
150 |
+
context_tokenized = self.tokenizer(context, padding='max_length',
|
151 |
+
truncation='longest_first', max_length=self.max_length,
|
152 |
+
return_tensors="pt", return_offsets_mapping=True,
|
153 |
+
input_commonsense_relations=commonsense_relation,
|
154 |
+
)
|
155 |
+
return context_tokenized
|
156 |
+
|
157 |
+
def get_relations_information(self, phrase_generated):
|
158 |
+
all_concepts = self.relation_mapper_builder.get_kg_concepts_from_context([phrase_generated], clear_common_wds=True)[0]
|
159 |
+
words = phrase_generated.strip().split(' ') # all words
|
160 |
+
concepts_with_relations = self.relation_mapper_builder.get_concepts_from_context(phrase_generated, clear_common_wds=True)
|
161 |
+
concepts_with_no_relations = list(set(all_concepts).difference(concepts_with_relations))
|
162 |
+
#print('without_relations:', concepts_with_no_relations)
|
163 |
+
print("====== RELATIONS SUMMARY ======")
|
164 |
+
print('phrase_generated:', phrase_generated)
|
165 |
+
print('words:', words)
|
166 |
+
print('all_concepts:', all_concepts)
|
167 |
+
print('concepts_with_relations:', concepts_with_relations)
|
168 |
+
print('without_relations:', concepts_with_no_relations)
|
169 |
+
print("\n== STATS:")
|
170 |
+
print('n_words:', len(words))
|
171 |
+
print('n_concepts:', len(all_concepts))
|
172 |
+
print('n_concepts_with_relations:', len(concepts_with_relations))
|
173 |
+
print('n_c_without_relations:', len(concepts_with_no_relations))
|
174 |
+
print("====== ================= ======")
|
175 |
+
return words, all_concepts, concepts_with_relations, concepts_with_no_relations
|
176 |
+
|
177 |
+
def remove_subsets(self, l):
|
178 |
+
l2 = l[:]
|
179 |
+
for m in l:
|
180 |
+
for n in l:
|
181 |
+
if set(m).issubset(set(n)) and m != n:
|
182 |
+
l2.remove(m)
|
183 |
+
break
|
184 |
+
return l2
|
185 |
+
|
186 |
+
def generate_based_on_context(self, context, use_kg=False):
|
187 |
+
model_input = self.pre_process_context(context)
|
188 |
+
#print(model_input)
|
189 |
+
gen_kwargs = {}
|
190 |
+
if "input_commonsense_relations" in model_input:
|
191 |
+
#print(model_input['input_commonsense_relations'].sum())
|
192 |
+
gen_kwargs["relation_inputs"] = model_input.get("input_commonsense_relations").to(self.device)
|
193 |
+
|
194 |
+
constraints = None
|
195 |
+
if use_kg:
|
196 |
+
constraints = []
|
197 |
+
concepts_from_context = self.relation_mapper_builder.get_concepts_from_context(context=context, clear_common_wds=True)
|
198 |
+
useful_concepts = [self.relation_mapper_builder.knowledge.get_related_concepts(concept) for concept in concepts_from_context]
|
199 |
+
if not useful_concepts:
|
200 |
+
useful_concepts = [self.kg_handler.get_related_concepts(concept) for concept in concepts_from_context]
|
201 |
+
useful_concepts = [[f'{phrase}' for phrase in concepts] for concepts in useful_concepts] # add spaces
|
202 |
+
#useful_concepts = [[phrase for phrase in concepts if len(phrase.split(' ')) == 1] for concepts in useful_concepts]
|
203 |
+
#useful_concepts = list(itertools.chain.from_iterable(useful_concepts))
|
204 |
+
#print('useful_concepts:', useful_concepts[:5])
|
205 |
+
if concepts_from_context:
|
206 |
+
for context_concept, neighbour_concepts in zip(concepts_from_context, useful_concepts):
|
207 |
+
print('neighbour:', neighbour_concepts[:20])
|
208 |
+
#flexible_words = self.most_similar_words(context_concept, neighbour_concepts) # limit the upperbound
|
209 |
+
#flexible_words = [word for word in flexible_words if word not in context_concept] # remove input concepts
|
210 |
+
flexible_words = [word for word in neighbour_concepts if word not in context_concept] # remove input concepts
|
211 |
+
flexible_words_ids: List[List[int]] = self.simple_tokenizer(flexible_words, add_prefix_space=True,add_special_tokens=False).input_ids
|
212 |
+
flexible_words_ids = self.remove_subsets(flexible_words_ids)
|
213 |
+
#add_prefix_space=True
|
214 |
+
#flexible_words_ids = [x for x in flexible_words_ids if len(x) == 1] # problem with subsets
|
215 |
+
flexible_words_ids = flexible_words_ids[:10]
|
216 |
+
print('flexible_words_ids:', flexible_words_ids[:3])
|
217 |
+
constraint = DisjunctiveConstraint(flexible_words_ids)
|
218 |
+
constraints.append(constraint)
|
219 |
+
else:
|
220 |
+
constraints = None
|
221 |
+
|
222 |
+
generated_answers_encoded = self.model.generate(input_ids=model_input["input_ids"].to(self.device),
|
223 |
+
attention_mask=model_input["attention_mask"].to(self.device),
|
224 |
+
constraints=constraints,
|
225 |
+
min_length=1,
|
226 |
+
max_length=self.max_length,
|
227 |
+
do_sample=False,
|
228 |
+
early_stopping=True,
|
229 |
+
num_beams=8,
|
230 |
+
temperature=1.0,
|
231 |
+
top_k=None,
|
232 |
+
top_p=None,
|
233 |
+
# eos_token_id=tokenizer.eos_token_id,
|
234 |
+
no_repeat_ngram_size=2,
|
235 |
+
num_return_sequences=1,
|
236 |
+
return_dict_in_generate=True,
|
237 |
+
output_attentions=True,
|
238 |
+
output_scores=True,
|
239 |
+
**gen_kwargs,
|
240 |
+
)
|
241 |
+
# print(f'Scores: {generated_answers_encoded}')
|
242 |
+
response = self.tokenizer.batch_decode(generated_answers_encoded['sequences'], skip_special_tokens=True,
|
243 |
+
clean_up_tokenization_spaces=True)
|
244 |
+
encoder_attentions = generated_answers_encoded['encoder_attentions']
|
245 |
+
return response, encoder_attentions, model_input
|
246 |
+
|
247 |
+
def get_related_concepts_list(self, knowledge, list_concepts):
|
248 |
+
other_concepts = []
|
249 |
+
for concept in list_concepts:
|
250 |
+
other_near_concepts = knowledge.get_related_concepts(concept)
|
251 |
+
other_concepts.extend(other_near_concepts)
|
252 |
+
return other_concepts
|
253 |
+
|
254 |
+
|
255 |
+
def generate_contrained_based_on_context(self, contexts, use_kg=True, max_concepts=1):
|
256 |
+
model_inputs = [self.pre_process_context(context) for context in contexts]
|
257 |
+
constraints = None
|
258 |
+
if use_kg:
|
259 |
+
constraints = []
|
260 |
+
concepts_from_contexts = [self.relation_mapper_builder.get_concepts_from_context(context=context, clear_common_wds=True) for context in contexts]
|
261 |
+
neighbours_contexts = []#[self.get_related_concepts_list(self.relation_mapper_builder.knowledge, context) for context in concepts_from_contexts]
|
262 |
+
if not neighbours_contexts:
|
263 |
+
neighbours_contexts = [self.get_related_concepts_list(self.kg_handler, context) for context in concepts_from_contexts]
|
264 |
+
all_constraints = []
|
265 |
+
for context_neighbours in neighbours_contexts:
|
266 |
+
# context_neighbours is a collection of concepts
|
267 |
+
# lets create sub collections of concepts
|
268 |
+
context_neighbours = [f' {concept}' for concept in context_neighbours if len(concept) > 3]
|
269 |
+
n_size_chuncks = len(context_neighbours) // max_concepts
|
270 |
+
n_size_chuncks = n_size_chuncks if n_size_chuncks > 0 else 1
|
271 |
+
sub_concepts_collection = list(get_jump_chunks(context_neighbours, jump=n_size_chuncks))
|
272 |
+
constraints = []
|
273 |
+
for sub_concepts in sub_concepts_collection[:max_concepts]:
|
274 |
+
flexible_words_ids: List[List[int]] = self.tokenizer(sub_concepts,
|
275 |
+
add_special_tokens=False).input_ids # add_prefix_space=True,
|
276 |
+
# flexible_words_ids = self.remove_subsets(flexible_words_ids)
|
277 |
+
flexible_words_ids = [[word_ids[0]] for word_ids in flexible_words_ids]
|
278 |
+
disjunctive_set = list(map(list, set(map(frozenset, flexible_words_ids))))
|
279 |
+
if not any(disjunctive_set):
|
280 |
+
continue
|
281 |
+
constraint = DisjunctiveConstraint(disjunctive_set)
|
282 |
+
constraints.append(constraint)
|
283 |
+
if not any(constraints):
|
284 |
+
constraints = None
|
285 |
+
all_constraints.append(constraints)
|
286 |
+
else:
|
287 |
+
all_constraints = None
|
288 |
+
if not all_constraints:
|
289 |
+
all_constraints = None
|
290 |
+
|
291 |
+
generated_answers_encoded = []
|
292 |
+
encoder_attentions_list = []
|
293 |
+
for i, contraints in enumerate(all_constraints):
|
294 |
+
#print('contraints.token_ids:', [x.token_ids for x in contraints])
|
295 |
+
gen_kwargs = {}
|
296 |
+
inputs = model_inputs[i]
|
297 |
+
if "input_commonsense_relations" in inputs:
|
298 |
+
# print(model_input['input_commonsense_relations'].sum())
|
299 |
+
gen_kwargs["relation_inputs"] = inputs.get("input_commonsense_relations").to(self.device)
|
300 |
+
#print('model_kwargs.get("attention_mask"):', model_kwargs.get("attention_mask"))
|
301 |
+
gen = self.model.generate(input_ids=inputs["input_ids"].to(self.device),
|
302 |
+
attention_mask=inputs["attention_mask"].to(self.device),
|
303 |
+
constraints=constraints,
|
304 |
+
min_length=1,
|
305 |
+
max_length=self.max_length,
|
306 |
+
do_sample=False,
|
307 |
+
early_stopping=True,
|
308 |
+
num_beams=8,
|
309 |
+
temperature=1.0,
|
310 |
+
top_k=None,
|
311 |
+
top_p=None,
|
312 |
+
# eos_token_id=tokenizer.eos_token_id,
|
313 |
+
no_repeat_ngram_size=2,
|
314 |
+
num_return_sequences=1,
|
315 |
+
return_dict_in_generate=True,
|
316 |
+
output_attentions=True,
|
317 |
+
output_scores=True,
|
318 |
+
**gen_kwargs,
|
319 |
+
)
|
320 |
+
# print('[gen]:', gen)
|
321 |
+
# print(tokenizer.batch_decode(gen))
|
322 |
+
generated_answers_encoded.append(gen['sequences'][0].detach().cpu())
|
323 |
+
encoder_attentions_list.append(gen['encoder_attentions'][0].detach().cpu())
|
324 |
+
# print(f'Scores: {generated_answers_encoded}')
|
325 |
+
text_results = self.tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,
|
326 |
+
clean_up_tokenization_spaces=True)
|
327 |
+
return text_results, encoder_attentions_list, model_inputs
|
328 |
+
|
329 |
+
def prepare_context_for_visualization(self, context):
|
330 |
+
examples, relations = [], []
|
331 |
+
response, encoder_outputs, model_input = self.generate_based_on_context(context)
|
332 |
+
input_commonsense_relations = model_input.get("input_commonsense_relations")
|
333 |
+
encoder_outputs = torch.stack(encoder_outputs)
|
334 |
+
n_layers, batch_size, n_heads, src, tgt = encoder_outputs.size()
|
335 |
+
print(encoder_outputs.size())
|
336 |
+
encoder_attentions = encoder_outputs.view(batch_size, n_layers, n_heads, src, tgt)
|
337 |
+
for i, ex in enumerate(encoder_attentions):
|
338 |
+
d = {}
|
339 |
+
indices = model_input['input_ids'][i].detach().cpu()
|
340 |
+
all_tokens = self.tokenizer.convert_ids_to_tokens(indices)
|
341 |
+
useful_indeces = indices != self.tokenizer.pad_token_id
|
342 |
+
all_tokens = np.array(all_tokens)[useful_indeces]
|
343 |
+
all_tokens = [tok.replace('Ġ', '') for tok in all_tokens]
|
344 |
+
d['words'] = all_tokens
|
345 |
+
d['attentions'] = ex.detach().cpu().numpy()
|
346 |
+
examples.append(d)
|
347 |
+
relations.append(input_commonsense_relations[i])
|
348 |
+
print(d['words'])
|
349 |
+
return response, examples, relations
|
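For orientation, a minimal usage sketch of the two classes defined in this file; the checkpoint path and the input sentence are placeholders, and the KGType / Model_Type members are assumed to be the ones exposed by utils.py:

from inference import Inference, RelationsInference
from utils import KGType, Model_Type

# plain BART generation, without any knowledge-graph machinery
baseline = Inference(model_path='facebook/bart-large', max_length=32)
answers, enc_attentions, model_input = baseline.generate_based_on_context('a dog catches a frisbee')

# relation-aware generation: the context is scanned for ConceptNet relations, which the
# custom tokenizer turns into matrices passed to the model as `relation_inputs`
kg_bart = RelationsInference(model_path='path/to/commonsense-bart-checkpoint',  # placeholder
                             kg_type=KGType.CONCEPTNET,
                             model_type=Model_Type.RELATIONS,
                             max_length=32)
answers, enc_attentions, model_input = kg_bart.generate_based_on_context('a dog catches a frisbee')
print(answers[0])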
kgs_binding/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .kg_base_wrapper import KGBaseHandler
from .relation_mapper_builder import RelationsMapperBuilder
from . import *
kgs_binding/conceptnet/__init__.py
ADDED
@@ -0,0 +1 @@
from . import *
kgs_binding/conceptnet/conceptnet_english_noun_2_noun_relations.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8b82686e2cb4a32a827d3c0a0c63a91d5d102fe5813fe898cabd9a117aa7374c
size 186932142
kgs_binding/conceptnet/conceptnet_english_nouns.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b90ab07ca7445623bcd90489367c4016ca3b4ed743816a99b730f22e13ac339c
size 140804377
kgs_binding/conceptnet/conceptnet_english_nouns_simple.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ad6e76d470432dc3c9c7c0ebbf340eda3c4f69008c9f8a27df97f8e005e5db02
size 22419586
kgs_binding/conceptnet_handler.py
ADDED
@@ -0,0 +1,61 @@
#############################
# Imports
#############################

# Python modules
from typing import Tuple, Optional, List
# Remote modules

# Local modules
from .kg_base_wrapper import KGBaseHandler
from utils import read_json_file_2_dict

#############################
# Constants
#############################

#############################
# Handler
#############################

class ConceptNetHandler(KGBaseHandler):
    def __init__(self, database=""):
        super(ConceptNetHandler, self).__init__()
        _store_dir = 'kgs_binding/conceptnet'
        self.conceptnet_concepts = read_json_file_2_dict('conceptnet_english_nouns_simple.json', store_dir=_store_dir)
        self.relations_concepts = read_json_file_2_dict('conceptnet_english_noun_2_noun_relations.json', store_dir=_store_dir)
        self.concept_2_concepts = read_json_file_2_dict('conceptnet_english_nouns.json', store_dir=_store_dir)

    def get_relation_types(self) -> List[str]:
        updated_relation_names = ['not_has_property', 'not_desires', 'external_u_r_l', 'created_by',
                                  'not_capable_of', 'antonym', 'has_first_subevent', 'located_near',
                                  'desires', 'has_prerequisite', 'has_last_subevent', 'synonym', 'is_a',
                                  'manner_of', 'has_a', 'motivated_by_goal', 'instance_of',
                                  'etymologically_derived_from', 'capable_of', 'for', 'at_location',
                                  'has_subevent', 'causes', 'has_context', 'symbol_of', 'derived_from',
                                  'made_of', 'causes_desire', 'has_property', 'similar_to', 'used_for', 'by',
                                  'entails', 'form_of', 'receives_action', 'distinct_from', 'related_to',
                                  'part_of', 'defined_as', 'etymologically_related_to']
        return updated_relation_names

    def exists_relation_between(self, concept, other_concept) -> bool:
        left_2_right, right_2_left = self.relation_between(concept, other_concept)
        return left_2_right is not None or right_2_left is not None

    def relation_between(self, concept, other_concept) -> Tuple[Optional[str], Optional[str]]:
        left_2_right_txt = f'{concept}|{other_concept}'
        right_2_left_txt = f'{other_concept}|{concept}'
        left_2_right_relations = self.relations_concepts.get(left_2_right_txt, None)
        right_2_left_relations = self.relations_concepts.get(right_2_left_txt, None)
        left_2_right_relation, right_2_left_relation = None, None
        if left_2_right_relations:
            left_2_right_relation = self.ignore_less_relevant_connection(left_2_right_relations)
        if right_2_left_relations:
            right_2_left_relation = self.ignore_less_relevant_connection(right_2_left_relations)
        return left_2_right_relation, right_2_left_relation

    def get_related_concepts(self, concept) -> Optional[List[str]]:
        return self.concept_2_concepts.get(concept, [])

    def does_concept_exist(self, concept) -> bool:
        return concept in self.conceptnet_concepts
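A short sketch of how the handler is typically queried; it assumes the Git LFS JSON files above have been fetched and the script runs from the repository root so the kgs_binding/conceptnet store directory resolves, and the concept pair is only illustrative:

from kgs_binding.conceptnet_handler import ConceptNetHandler

kg = ConceptNetHandler()
# relations are stored under 'concept|other_concept' keys, so both directions are checked
left_to_right, right_to_left = kg.relation_between('bird', 'water')
print(left_to_right, right_to_left)          # e.g. a relation name such as 'at_location', or None
print(kg.exists_relation_between('bird', 'water'))
print(kg.does_concept_exist('bird'))         # membership in the simple nouns vocabulary
print(kg.get_related_concepts('bird')[:5])   # neighbouring concepts, [] for unknown concepts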
kgs_binding/english_stopwords.txt
ADDED
@@ -0,0 +1,1126 @@
1 |
+
'll
|
2 |
+
'tis
|
3 |
+
'twas
|
4 |
+
've
|
5 |
+
a
|
6 |
+
a's
|
7 |
+
able
|
8 |
+
ableabout
|
9 |
+
about
|
10 |
+
above
|
11 |
+
abroad
|
12 |
+
abst
|
13 |
+
accordance
|
14 |
+
according
|
15 |
+
accordingly
|
16 |
+
across
|
17 |
+
act
|
18 |
+
actually
|
19 |
+
ad
|
20 |
+
added
|
21 |
+
adj
|
22 |
+
adopted
|
23 |
+
ae
|
24 |
+
af
|
25 |
+
affected
|
26 |
+
affecting
|
27 |
+
ag
|
28 |
+
ah
|
29 |
+
ai
|
30 |
+
ain't
|
31 |
+
aint
|
32 |
+
al
|
33 |
+
all
|
34 |
+
almost
|
35 |
+
along
|
36 |
+
alongside
|
37 |
+
also
|
38 |
+
although
|
39 |
+
am
|
40 |
+
amid
|
41 |
+
amidst
|
42 |
+
among
|
43 |
+
amongst
|
44 |
+
amoungst
|
45 |
+
an
|
46 |
+
and
|
47 |
+
another
|
48 |
+
any
|
49 |
+
anybody
|
50 |
+
anyhow
|
51 |
+
anymore
|
52 |
+
anyone
|
53 |
+
anything
|
54 |
+
anyway
|
55 |
+
anyways
|
56 |
+
anywhere
|
57 |
+
ao
|
58 |
+
apart
|
59 |
+
apparently
|
60 |
+
appear
|
61 |
+
appreciate
|
62 |
+
appropriate
|
63 |
+
approximately
|
64 |
+
aq
|
65 |
+
ar
|
66 |
+
are
|
67 |
+
aren
|
68 |
+
aren't
|
69 |
+
arent
|
70 |
+
arise
|
71 |
+
around
|
72 |
+
arpa
|
73 |
+
as
|
74 |
+
aside
|
75 |
+
ask
|
76 |
+
asked
|
77 |
+
asking
|
78 |
+
asks
|
79 |
+
associated
|
80 |
+
at
|
81 |
+
au
|
82 |
+
auth
|
83 |
+
aw
|
84 |
+
away
|
85 |
+
awfully
|
86 |
+
az
|
87 |
+
b
|
88 |
+
ba
|
89 |
+
back
|
90 |
+
backed
|
91 |
+
backing
|
92 |
+
backs
|
93 |
+
bb
|
94 |
+
bd
|
95 |
+
be
|
96 |
+
became
|
97 |
+
because
|
98 |
+
become
|
99 |
+
becomes
|
100 |
+
becoming
|
101 |
+
been
|
102 |
+
beforehand
|
103 |
+
began
|
104 |
+
beginning
|
105 |
+
beginnings
|
106 |
+
begins
|
107 |
+
behind
|
108 |
+
being
|
109 |
+
beings
|
110 |
+
believe
|
111 |
+
below
|
112 |
+
beside
|
113 |
+
besides
|
114 |
+
best
|
115 |
+
better
|
116 |
+
between
|
117 |
+
beyond
|
118 |
+
bf
|
119 |
+
bg
|
120 |
+
bh
|
121 |
+
bi
|
122 |
+
biol
|
123 |
+
bj
|
124 |
+
bm
|
125 |
+
bn
|
126 |
+
bo
|
127 |
+
both
|
128 |
+
br
|
129 |
+
brief
|
130 |
+
briefly
|
131 |
+
bs
|
132 |
+
bt
|
133 |
+
but
|
134 |
+
buy
|
135 |
+
bv
|
136 |
+
bw
|
137 |
+
by
|
138 |
+
bz
|
139 |
+
c
|
140 |
+
c'mon
|
141 |
+
c's
|
142 |
+
ca
|
143 |
+
call
|
144 |
+
came
|
145 |
+
can
|
146 |
+
can't
|
147 |
+
cannot
|
148 |
+
cant
|
149 |
+
caption
|
150 |
+
case
|
151 |
+
cases
|
152 |
+
cause
|
153 |
+
causes
|
154 |
+
cc
|
155 |
+
cd
|
156 |
+
certain
|
157 |
+
certainly
|
158 |
+
cf
|
159 |
+
cg
|
160 |
+
ch
|
161 |
+
changes
|
162 |
+
ci
|
163 |
+
ck
|
164 |
+
cl
|
165 |
+
clear
|
166 |
+
clearly
|
167 |
+
cm
|
168 |
+
cmon
|
169 |
+
cn
|
170 |
+
co
|
171 |
+
co.
|
172 |
+
com
|
173 |
+
come
|
174 |
+
comes
|
175 |
+
con
|
176 |
+
consequently
|
177 |
+
contain
|
178 |
+
containing
|
179 |
+
contains
|
180 |
+
copy
|
181 |
+
corresponding
|
182 |
+
could
|
183 |
+
could've
|
184 |
+
couldn
|
185 |
+
couldn't
|
186 |
+
couldnt
|
187 |
+
cr
|
188 |
+
cs
|
189 |
+
cu
|
190 |
+
currently
|
191 |
+
cv
|
192 |
+
cx
|
193 |
+
cy
|
194 |
+
cz
|
195 |
+
d
|
196 |
+
dare
|
197 |
+
daren't
|
198 |
+
darent
|
199 |
+
de
|
200 |
+
dear
|
201 |
+
definitely
|
202 |
+
describe
|
203 |
+
described
|
204 |
+
despite
|
205 |
+
detail
|
206 |
+
did
|
207 |
+
didn
|
208 |
+
didn't
|
209 |
+
didnt
|
210 |
+
differ
|
211 |
+
different
|
212 |
+
differently
|
213 |
+
directly
|
214 |
+
dj
|
215 |
+
dk
|
216 |
+
dm
|
217 |
+
do
|
218 |
+
does
|
219 |
+
doesn
|
220 |
+
doesn't
|
221 |
+
doesnt
|
222 |
+
doing
|
223 |
+
don
|
224 |
+
don't
|
225 |
+
done
|
226 |
+
dont
|
227 |
+
downed
|
228 |
+
downing
|
229 |
+
due
|
230 |
+
during
|
231 |
+
dz
|
232 |
+
e
|
233 |
+
each
|
234 |
+
ec
|
235 |
+
ed
|
236 |
+
edu
|
237 |
+
ee
|
238 |
+
eg
|
239 |
+
eh
|
240 |
+
either
|
241 |
+
else
|
242 |
+
elsewhere
|
243 |
+
enough
|
244 |
+
entirely
|
245 |
+
er
|
246 |
+
es
|
247 |
+
especially
|
248 |
+
et
|
249 |
+
et-al
|
250 |
+
etc
|
251 |
+
even
|
252 |
+
evenly
|
253 |
+
ever
|
254 |
+
evermore
|
255 |
+
every
|
256 |
+
everybody
|
257 |
+
everyone
|
258 |
+
everything
|
259 |
+
everywhere
|
260 |
+
ex
|
261 |
+
exactly
|
262 |
+
example
|
263 |
+
except
|
264 |
+
f
|
265 |
+
fairly
|
266 |
+
far
|
267 |
+
farther
|
268 |
+
felt
|
269 |
+
few
|
270 |
+
fewer
|
271 |
+
ff
|
272 |
+
fi
|
273 |
+
fify
|
274 |
+
fj
|
275 |
+
fk
|
276 |
+
fm
|
277 |
+
fo
|
278 |
+
for
|
279 |
+
forever
|
280 |
+
formerly
|
281 |
+
forth
|
282 |
+
found
|
283 |
+
fr
|
284 |
+
from
|
285 |
+
front
|
286 |
+
full
|
287 |
+
fully
|
288 |
+
further
|
289 |
+
furthered
|
290 |
+
furthering
|
291 |
+
furthermore
|
292 |
+
furthers
|
293 |
+
fx
|
294 |
+
g
|
295 |
+
ga
|
296 |
+
gave
|
297 |
+
gb
|
298 |
+
gd
|
299 |
+
ge
|
300 |
+
generally
|
301 |
+
gf
|
302 |
+
gg
|
303 |
+
gh
|
304 |
+
gi
|
305 |
+
gl
|
306 |
+
gm
|
307 |
+
gmt
|
308 |
+
gn
|
309 |
+
go
|
310 |
+
got
|
311 |
+
gotten
|
312 |
+
gov
|
313 |
+
gp
|
314 |
+
gq
|
315 |
+
gr
|
316 |
+
great
|
317 |
+
greater
|
318 |
+
greatest
|
319 |
+
greetings
|
320 |
+
group
|
321 |
+
grouped
|
322 |
+
grouping
|
323 |
+
groups
|
324 |
+
gs
|
325 |
+
gt
|
326 |
+
gu
|
327 |
+
gw
|
328 |
+
gy
|
329 |
+
h
|
330 |
+
had
|
331 |
+
hadn't
|
332 |
+
hadnt
|
333 |
+
half
|
334 |
+
happens
|
335 |
+
hardly
|
336 |
+
has
|
337 |
+
hasn
|
338 |
+
hasn't
|
339 |
+
hasnt
|
340 |
+
have
|
341 |
+
haven
|
342 |
+
haven't
|
343 |
+
havent
|
344 |
+
having
|
345 |
+
he
|
346 |
+
he'd
|
347 |
+
he'll
|
348 |
+
he's
|
349 |
+
hed
|
350 |
+
hell
|
351 |
+
hello
|
352 |
+
help
|
353 |
+
hence
|
354 |
+
her
|
355 |
+
here
|
356 |
+
here's
|
357 |
+
hereafter
|
358 |
+
hereby
|
359 |
+
herein
|
360 |
+
heres
|
361 |
+
hereupon
|
362 |
+
hers
|
363 |
+
herself
|
364 |
+
herse”
|
365 |
+
hes
|
366 |
+
hi
|
367 |
+
hid
|
368 |
+
high
|
369 |
+
higher
|
370 |
+
highest
|
371 |
+
him
|
372 |
+
himself
|
373 |
+
himse”
|
374 |
+
his
|
375 |
+
hither
|
376 |
+
hk
|
377 |
+
hm
|
378 |
+
hn
|
379 |
+
hopefully
|
380 |
+
how
|
381 |
+
how'd
|
382 |
+
how'll
|
383 |
+
how's
|
384 |
+
howbeit
|
385 |
+
however
|
386 |
+
hr
|
387 |
+
ht
|
388 |
+
htm
|
389 |
+
hu
|
390 |
+
i
|
391 |
+
i'd
|
392 |
+
i'll
|
393 |
+
i'm
|
394 |
+
i've
|
395 |
+
i.e.
|
396 |
+
id
|
397 |
+
ie
|
398 |
+
if
|
399 |
+
ignored
|
400 |
+
ii
|
401 |
+
il
|
402 |
+
ill
|
403 |
+
im
|
404 |
+
immediate
|
405 |
+
immediately
|
406 |
+
importance
|
407 |
+
important
|
408 |
+
in
|
409 |
+
inasmuch
|
410 |
+
inc
|
411 |
+
inc.
|
412 |
+
indeed
|
413 |
+
index
|
414 |
+
indicate
|
415 |
+
indicated
|
416 |
+
indicates
|
417 |
+
information
|
418 |
+
inner
|
419 |
+
inside
|
420 |
+
insofar
|
421 |
+
instead
|
422 |
+
int
|
423 |
+
interest
|
424 |
+
interested
|
425 |
+
interesting
|
426 |
+
interests
|
427 |
+
into
|
428 |
+
inward
|
429 |
+
io
|
430 |
+
iq
|
431 |
+
ir
|
432 |
+
is
|
433 |
+
isn
|
434 |
+
isn't
|
435 |
+
isnt
|
436 |
+
it
|
437 |
+
it'd
|
438 |
+
it'll
|
439 |
+
it's
|
440 |
+
itd
|
441 |
+
itll
|
442 |
+
its
|
443 |
+
itself
|
444 |
+
itse”
|
445 |
+
ive
|
446 |
+
j
|
447 |
+
je
|
448 |
+
jm
|
449 |
+
jo
|
450 |
+
join
|
451 |
+
jp
|
452 |
+
just
|
453 |
+
k
|
454 |
+
ke
|
455 |
+
keep
|
456 |
+
keeps
|
457 |
+
kept
|
458 |
+
kg
|
459 |
+
kh
|
460 |
+
ki
|
461 |
+
kind
|
462 |
+
km
|
463 |
+
kn
|
464 |
+
knew
|
465 |
+
know
|
466 |
+
known
|
467 |
+
knows
|
468 |
+
kp
|
469 |
+
kr
|
470 |
+
kw
|
471 |
+
ky
|
472 |
+
kz
|
473 |
+
l
|
474 |
+
la
|
475 |
+
large
|
476 |
+
largely
|
477 |
+
last
|
478 |
+
lately
|
479 |
+
later
|
480 |
+
latest
|
481 |
+
latter
|
482 |
+
latterly
|
483 |
+
lb
|
484 |
+
lc
|
485 |
+
least
|
486 |
+
less
|
487 |
+
lest
|
488 |
+
let
|
489 |
+
let's
|
490 |
+
lets
|
491 |
+
li
|
492 |
+
like
|
493 |
+
liked
|
494 |
+
likely
|
495 |
+
likewise
|
496 |
+
line
|
497 |
+
lk
|
498 |
+
ll
|
499 |
+
look
|
500 |
+
looking
|
501 |
+
looks
|
502 |
+
lower
|
503 |
+
lr
|
504 |
+
ls
|
505 |
+
lt
|
506 |
+
ltd
|
507 |
+
lu
|
508 |
+
lv
|
509 |
+
ly
|
510 |
+
m
|
511 |
+
ma
|
512 |
+
made
|
513 |
+
mainly
|
514 |
+
make
|
515 |
+
makes
|
516 |
+
making
|
517 |
+
many
|
518 |
+
may
|
519 |
+
maybe
|
520 |
+
mayn't
|
521 |
+
maynt
|
522 |
+
mc
|
523 |
+
md
|
524 |
+
me
|
525 |
+
mean
|
526 |
+
means
|
527 |
+
meantime
|
528 |
+
meanwhile
|
529 |
+
member
|
530 |
+
members
|
531 |
+
merely
|
532 |
+
mg
|
533 |
+
mh
|
534 |
+
might
|
535 |
+
might've
|
536 |
+
mightn't
|
537 |
+
mightnt
|
538 |
+
mil
|
539 |
+
mill
|
540 |
+
mine
|
541 |
+
miss
|
542 |
+
mk
|
543 |
+
ml
|
544 |
+
mm
|
545 |
+
mn
|
546 |
+
mo
|
547 |
+
more
|
548 |
+
moreover
|
549 |
+
most
|
550 |
+
mostly
|
551 |
+
move
|
552 |
+
mp
|
553 |
+
mq
|
554 |
+
mr
|
555 |
+
mrs
|
556 |
+
ms
|
557 |
+
msie
|
558 |
+
mt
|
559 |
+
mu
|
560 |
+
much
|
561 |
+
mug
|
562 |
+
must
|
563 |
+
must've
|
564 |
+
mustn't
|
565 |
+
mustnt
|
566 |
+
mv
|
567 |
+
mw
|
568 |
+
mx
|
569 |
+
my
|
570 |
+
myself
|
571 |
+
myse”
|
572 |
+
mz
|
573 |
+
n
|
574 |
+
na
|
575 |
+
namely
|
576 |
+
nay
|
577 |
+
nc
|
578 |
+
nd
|
579 |
+
ne
|
580 |
+
nearly
|
581 |
+
necessarily
|
582 |
+
necessary
|
583 |
+
need
|
584 |
+
needed
|
585 |
+
needing
|
586 |
+
needn't
|
587 |
+
neednt
|
588 |
+
needs
|
589 |
+
neither
|
590 |
+
net
|
591 |
+
never
|
592 |
+
neverf
|
593 |
+
neverless
|
594 |
+
nevertheless
|
595 |
+
newer
|
596 |
+
newest
|
597 |
+
nf
|
598 |
+
ng
|
599 |
+
ni
|
600 |
+
nl
|
601 |
+
no
|
602 |
+
no-one
|
603 |
+
nobody
|
604 |
+
non
|
605 |
+
none
|
606 |
+
nonetheless
|
607 |
+
noone
|
608 |
+
nor
|
609 |
+
normally
|
610 |
+
nos
|
611 |
+
not
|
612 |
+
noted
|
613 |
+
nothing
|
614 |
+
notwithstanding
|
615 |
+
nowhere
|
616 |
+
np
|
617 |
+
nr
|
618 |
+
nu
|
619 |
+
null
|
620 |
+
nz
|
621 |
+
o
|
622 |
+
obtain
|
623 |
+
obtained
|
624 |
+
obviously
|
625 |
+
of
|
626 |
+
off
|
627 |
+
often
|
628 |
+
oh
|
629 |
+
ok
|
630 |
+
okay
|
631 |
+
om
|
632 |
+
omitted
|
633 |
+
on
|
634 |
+
once
|
635 |
+
one
|
636 |
+
one's
|
637 |
+
ones
|
638 |
+
only
|
639 |
+
onto
|
640 |
+
open
|
641 |
+
opened
|
642 |
+
opening
|
643 |
+
opens
|
644 |
+
opposite
|
645 |
+
or
|
646 |
+
ord
|
647 |
+
order
|
648 |
+
ordered
|
649 |
+
ordering
|
650 |
+
orders
|
651 |
+
org
|
652 |
+
other
|
653 |
+
others
|
654 |
+
otherwise
|
655 |
+
ought
|
656 |
+
oughtn't
|
657 |
+
oughtnt
|
658 |
+
our
|
659 |
+
ours
|
660 |
+
ourselves
|
661 |
+
out
|
662 |
+
over
|
663 |
+
overall
|
664 |
+
owing
|
665 |
+
own
|
666 |
+
p
|
667 |
+
pa
|
668 |
+
part
|
669 |
+
parted
|
670 |
+
particular
|
671 |
+
particularly
|
672 |
+
parting
|
673 |
+
parts
|
674 |
+
past
|
675 |
+
pe
|
676 |
+
per
|
677 |
+
perhaps
|
678 |
+
pf
|
679 |
+
pg
|
680 |
+
ph
|
681 |
+
pk
|
682 |
+
pl
|
683 |
+
place
|
684 |
+
placed
|
685 |
+
places
|
686 |
+
please
|
687 |
+
pm
|
688 |
+
pmid
|
689 |
+
pn
|
690 |
+
pointed
|
691 |
+
pointing
|
692 |
+
poorly
|
693 |
+
possible
|
694 |
+
possibly
|
695 |
+
potentially
|
696 |
+
pp
|
697 |
+
pr
|
698 |
+
predominantly
|
699 |
+
present
|
700 |
+
presented
|
701 |
+
presenting
|
702 |
+
presents
|
703 |
+
presumably
|
704 |
+
previously
|
705 |
+
primarily
|
706 |
+
probably
|
707 |
+
problem
|
708 |
+
problems
|
709 |
+
promptly
|
710 |
+
proud
|
711 |
+
provided
|
712 |
+
provides
|
713 |
+
pt
|
714 |
+
put
|
715 |
+
puts
|
716 |
+
pw
|
717 |
+
py
|
718 |
+
q
|
719 |
+
qa
|
720 |
+
que
|
721 |
+
quickly
|
722 |
+
quite
|
723 |
+
qv
|
724 |
+
r
|
725 |
+
rather
|
726 |
+
rd
|
727 |
+
re
|
728 |
+
readily
|
729 |
+
really
|
730 |
+
reasonably
|
731 |
+
recent
|
732 |
+
recently
|
733 |
+
ref
|
734 |
+
refs
|
735 |
+
regarding
|
736 |
+
regardless
|
737 |
+
regards
|
738 |
+
related
|
739 |
+
relatively
|
740 |
+
reserved
|
741 |
+
respectively
|
742 |
+
resulted
|
743 |
+
resulting
|
744 |
+
results
|
745 |
+
ro
|
746 |
+
ru
|
747 |
+
rw
|
748 |
+
s
|
749 |
+
sa
|
750 |
+
said
|
751 |
+
same
|
752 |
+
saw
|
753 |
+
saying
|
754 |
+
says
|
755 |
+
sb
|
756 |
+
sc
|
757 |
+
sd
|
758 |
+
se
|
759 |
+
sec
|
760 |
+
section
|
761 |
+
see
|
762 |
+
seeing
|
763 |
+
seem
|
764 |
+
seemed
|
765 |
+
seeming
|
766 |
+
seems
|
767 |
+
seen
|
768 |
+
sees
|
769 |
+
self
|
770 |
+
selves
|
771 |
+
sensible
|
772 |
+
sent
|
773 |
+
serious
|
774 |
+
seriously
|
775 |
+
several
|
776 |
+
sg
|
777 |
+
sh
|
778 |
+
shall
|
779 |
+
shan't
|
780 |
+
shant
|
781 |
+
she
|
782 |
+
she'd
|
783 |
+
she'll
|
784 |
+
she's
|
785 |
+
shed
|
786 |
+
shell
|
787 |
+
shes
|
788 |
+
should
|
789 |
+
should've
|
790 |
+
shouldn
|
791 |
+
shouldn't
|
792 |
+
shouldnt
|
793 |
+
showed
|
794 |
+
showing
|
795 |
+
shown
|
796 |
+
showns
|
797 |
+
si
|
798 |
+
side
|
799 |
+
sides
|
800 |
+
significant
|
801 |
+
significantly
|
802 |
+
similar
|
803 |
+
similarly
|
804 |
+
since
|
805 |
+
sincere
|
806 |
+
site
|
807 |
+
sj
|
808 |
+
sk
|
809 |
+
sl
|
810 |
+
slightly
|
811 |
+
sm
|
812 |
+
sn
|
813 |
+
so
|
814 |
+
some
|
815 |
+
somebody
|
816 |
+
someday
|
817 |
+
somehow
|
818 |
+
someone
|
819 |
+
somethan
|
820 |
+
something
|
821 |
+
sometime
|
822 |
+
sometimes
|
823 |
+
somewhat
|
824 |
+
somewhere
|
825 |
+
specifically
|
826 |
+
specified
|
827 |
+
specify
|
828 |
+
specifying
|
829 |
+
sr
|
830 |
+
st
|
831 |
+
state
|
832 |
+
states
|
833 |
+
still
|
834 |
+
stop
|
835 |
+
strongly
|
836 |
+
su
|
837 |
+
sub
|
838 |
+
substantially
|
839 |
+
successfully
|
840 |
+
such
|
841 |
+
sufficiently
|
842 |
+
suggest
|
843 |
+
sup
|
844 |
+
sure
|
845 |
+
sv
|
846 |
+
sy
|
847 |
+
sz
|
848 |
+
t
|
849 |
+
t's
|
850 |
+
take
|
851 |
+
taken
|
852 |
+
taking
|
853 |
+
tc
|
854 |
+
td
|
855 |
+
tell
|
856 |
+
tends
|
857 |
+
tf
|
858 |
+
tg
|
859 |
+
th
|
860 |
+
than
|
861 |
+
thank
|
862 |
+
thanks
|
863 |
+
thanx
|
864 |
+
that
|
865 |
+
that'll
|
866 |
+
that's
|
867 |
+
that've
|
868 |
+
thatll
|
869 |
+
thats
|
870 |
+
thatve
|
871 |
+
the
|
872 |
+
their
|
873 |
+
theirs
|
874 |
+
them
|
875 |
+
themselves
|
876 |
+
then
|
877 |
+
thence
|
878 |
+
there
|
879 |
+
there'd
|
880 |
+
there'll
|
881 |
+
there're
|
882 |
+
there's
|
883 |
+
there've
|
884 |
+
thereafter
|
885 |
+
thereby
|
886 |
+
thered
|
887 |
+
therefore
|
888 |
+
therein
|
889 |
+
therell
|
890 |
+
thereof
|
891 |
+
therere
|
892 |
+
theres
|
893 |
+
thereto
|
894 |
+
thereupon
|
895 |
+
thereve
|
896 |
+
these
|
897 |
+
they
|
898 |
+
they'd
|
899 |
+
they'll
|
900 |
+
they're
|
901 |
+
they've
|
902 |
+
theyd
|
903 |
+
theyll
|
904 |
+
theyre
|
905 |
+
theyve
|
906 |
+
thick
|
907 |
+
thin
|
908 |
+
thing
|
909 |
+
things
|
910 |
+
think
|
911 |
+
thinks
|
912 |
+
third
|
913 |
+
thirty
|
914 |
+
this
|
915 |
+
thorough
|
916 |
+
thoroughly
|
917 |
+
those
|
918 |
+
thou
|
919 |
+
though
|
920 |
+
thoughh
|
921 |
+
thought
|
922 |
+
thoughts
|
923 |
+
thousand
|
924 |
+
throug
|
925 |
+
through
|
926 |
+
throughout
|
927 |
+
thru
|
928 |
+
thus
|
929 |
+
til
|
930 |
+
till
|
931 |
+
tis
|
932 |
+
tj
|
933 |
+
tk
|
934 |
+
tm
|
935 |
+
tn
|
936 |
+
to
|
937 |
+
today
|
938 |
+
together
|
939 |
+
too
|
940 |
+
took
|
941 |
+
tp
|
942 |
+
tr
|
943 |
+
tried
|
944 |
+
tries
|
945 |
+
truly
|
946 |
+
trying
|
947 |
+
ts
|
948 |
+
tt
|
949 |
+
turn
|
950 |
+
turned
|
951 |
+
turning
|
952 |
+
turns
|
953 |
+
tw
|
954 |
+
twas
|
955 |
+
tz
|
956 |
+
u
|
957 |
+
ua
|
958 |
+
ug
|
959 |
+
uk
|
960 |
+
um
|
961 |
+
un
|
962 |
+
underneath
|
963 |
+
undoing
|
964 |
+
unfortunately
|
965 |
+
unless
|
966 |
+
unlike
|
967 |
+
unlikely
|
968 |
+
until
|
969 |
+
unto
|
970 |
+
upon
|
971 |
+
ups
|
972 |
+
us
|
973 |
+
use
|
974 |
+
used
|
975 |
+
useful
|
976 |
+
usefully
|
977 |
+
usefulness
|
978 |
+
uses
|
979 |
+
using
|
980 |
+
usually
|
981 |
+
uucp
|
982 |
+
uy
|
983 |
+
uz
|
984 |
+
v
|
985 |
+
va
|
986 |
+
value
|
987 |
+
various
|
988 |
+
vc
|
989 |
+
ve
|
990 |
+
versus
|
991 |
+
very
|
992 |
+
vg
|
993 |
+
vi
|
994 |
+
via
|
995 |
+
viz
|
996 |
+
vn
|
997 |
+
vol
|
998 |
+
vols
|
999 |
+
vs
|
1000 |
+
vu
|
1001 |
+
w
|
1002 |
+
want
|
1003 |
+
wanted
|
1004 |
+
wanting
|
1005 |
+
wants
|
1006 |
+
was
|
1007 |
+
wasn
|
1008 |
+
wasn't
|
1009 |
+
wasnt
|
1010 |
+
way
|
1011 |
+
ways
|
1012 |
+
we
|
1013 |
+
we'd
|
1014 |
+
we'll
|
1015 |
+
we're
|
1016 |
+
we've
|
1017 |
+
web
|
1018 |
+
wed
|
1019 |
+
welcome
|
1020 |
+
well
|
1021 |
+
wells
|
1022 |
+
went
|
1023 |
+
were
|
1024 |
+
weren
|
1025 |
+
weren't
|
1026 |
+
werent
|
1027 |
+
weve
|
1028 |
+
wf
|
1029 |
+
what
|
1030 |
+
what'd
|
1031 |
+
what'll
|
1032 |
+
what's
|
1033 |
+
what've
|
1034 |
+
whatever
|
1035 |
+
whatll
|
1036 |
+
whats
|
1037 |
+
whatve
|
1038 |
+
when
|
1039 |
+
when'd
|
1040 |
+
when'll
|
1041 |
+
when's
|
1042 |
+
whence
|
1043 |
+
whenever
|
1044 |
+
where
|
1045 |
+
where'd
|
1046 |
+
where'll
|
1047 |
+
where's
|
1048 |
+
whereafter
|
1049 |
+
whereas
|
1050 |
+
whereby
|
1051 |
+
wherein
|
1052 |
+
wheres
|
1053 |
+
whereupon
|
1054 |
+
wherever
|
1055 |
+
whether
|
1056 |
+
which
|
1057 |
+
whichever
|
1058 |
+
while
|
1059 |
+
whilst
|
1060 |
+
whim
|
1061 |
+
whither
|
1062 |
+
who
|
1063 |
+
who'd
|
1064 |
+
who'll
|
1065 |
+
who's
|
1066 |
+
whod
|
1067 |
+
whoever
|
1068 |
+
whole
|
1069 |
+
wholl
|
1070 |
+
whom
|
1071 |
+
whomever
|
1072 |
+
whos
|
1073 |
+
whose
|
1074 |
+
why
|
1075 |
+
why'd
|
1076 |
+
why'll
|
1077 |
+
why's
|
1078 |
+
widely
|
1079 |
+
width
|
1080 |
+
will
|
1081 |
+
willing
|
1082 |
+
with
|
1083 |
+
within
|
1084 |
+
without
|
1085 |
+
won
|
1086 |
+
won't
|
1087 |
+
wonder
|
1088 |
+
wont
|
1089 |
+
words
|
1090 |
+
worked
|
1091 |
+
working
|
1092 |
+
works
|
1093 |
+
world
|
1094 |
+
would
|
1095 |
+
would've
|
1096 |
+
wouldn
|
1097 |
+
wouldn't
|
1098 |
+
wouldnt
|
1099 |
+
ws
|
1100 |
+
www
|
1101 |
+
x
|
1102 |
+
y
|
1103 |
+
ye
|
1104 |
+
year
|
1105 |
+
years
|
1106 |
+
yes
|
1107 |
+
yet
|
1108 |
+
you
|
1109 |
+
you'd
|
1110 |
+
you'll
|
1111 |
+
you're
|
1112 |
+
you've
|
1113 |
+
youd
|
1114 |
+
youll
|
1115 |
+
your
|
1116 |
+
youre
|
1117 |
+
yours
|
1118 |
+
yourself
|
1119 |
+
yourselves
|
1120 |
+
youve
|
1121 |
+
yt
|
1122 |
+
yu
|
1123 |
+
z
|
1124 |
+
za
|
1125 |
+
zm
|
1126 |
+
zr
|
kgs_binding/kg_base_wrapper.py
ADDED
@@ -0,0 +1,80 @@
#############################
# Imports
#############################

# Python modules
from abc import ABC, abstractmethod
from typing import Tuple, Optional, List

# Remote modules
from nltk.stem import WordNetLemmatizer

# Local modules

#############################
# Constants
#############################

class KGBaseHandler(ABC):
    def __init__(self):
        super().__init__()
        self.st = WordNetLemmatizer()

    def normalize_noun(self, ent):
        try:
            noun = self.st.lemmatize(ent, pos='n')
            noun = self.st.lemmatize(noun, pos='v')
        except Exception as _:
            noun = ent[:-1] if ent[-1] == 's' else ent
        return noun

    def normalize_nouns(self, ent):
        local_ent = ent[:]
        nouns = local_ent.split(' ')
        if len(nouns) == 1:
            return ' '.join([self.normalize_noun(e) for e in nouns])
        return local_ent

    def ignore_less_relevant_connection(self, relations):
        if len(relations) >= 2:
            for r in relations:
                if r != 'related_to':
                    return r
        return relations[0]

    @abstractmethod
    def get_relation_types(self) -> List[str]:
        pass

    @abstractmethod
    def exists_relation_between(self, concept, other_concept) -> bool:
        pass

    @abstractmethod
    def relation_between(self, concept, other_concept) -> Tuple[Optional[str], Optional[str]]:
        pass

    @abstractmethod
    def get_related_concepts(self, concept) -> Optional[List[str]]:
        pass

    @abstractmethod
    def does_concept_exist(self, concept) -> bool:
        pass

class NoKnowledge(KGBaseHandler):
    def __init__(self):
        super(NoKnowledge, self).__init__()

    def get_relation_types(self) -> List[str]:
        return []

    def exists_relation_between(self, concept, other_concept) -> bool:
        return False

    def relation_between(self, concept, other_concept) -> Tuple[Optional[str], Optional[str]]:
        return (None, None)

    def does_concept_exist(self, concept) -> bool:
        return False
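For context, a minimal sketch of what a new knowledge-graph binding would have to implement on top of the abstract interface above; the tiny in-memory dictionary is invented purely for illustration:

from typing import Tuple, Optional, List

from kgs_binding.kg_base_wrapper import KGBaseHandler

class ToyKG(KGBaseHandler):
    # invented toy data: a single directed relation
    RELATIONS = {('dog', 'animal'): 'is_a'}

    def get_relation_types(self) -> List[str]:
        return ['is_a']

    def exists_relation_between(self, concept, other_concept) -> bool:
        left, right = self.relation_between(concept, other_concept)
        return left is not None or right is not None

    def relation_between(self, concept, other_concept) -> Tuple[Optional[str], Optional[str]]:
        return (self.RELATIONS.get((concept, other_concept)),
                self.RELATIONS.get((other_concept, concept)))

    def get_related_concepts(self, concept) -> Optional[List[str]]:
        return [b for (a, b) in self.RELATIONS if a == concept]

    def does_concept_exist(self, concept) -> bool:
        return any(concept in pair for pair in self.RELATIONS)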
kgs_binding/kg_qa_binding_utils.py
ADDED
@@ -0,0 +1,73 @@
#############################
# Imports
#############################

# Python modules
from typing import List, Tuple
from enum import Enum

# Remote modules

# Local modules
from .kg_base_wrapper import KGBaseHandler
from .swow_handler import SwowHandler
from .conceptnet_handler import ConceptNetHandler
from utils import read_json_file_2_dict, Data_Type

#############################
# Constants
#############################

#############################
# Stuff
#############################

class KGType(Enum):
    SWOW = 'swow'
    CSKG = 'cskg'
    CONCEPTNET = 'conceptnet'

def load_kg_handler(kg_type: KGType):
    if kg_type.value == KGType.SWOW.value:
        return SwowHandler()
    elif kg_type.value == KGType.CONCEPTNET.value:
        return ConceptNetHandler()
    else:
        raise NotImplementedError()

def _load_data_paths_metadata():
    try:
        data = read_json_file_2_dict('data_config.json', store_dir='run_config')
    except:
        data = None
    return data

def from_relations_path_2_relations(dataset_types: List[Data_Type], metadata):
    relations = []
    print('metadata:', metadata)
    for dataset_type in dataset_types:
        qa_meta_data = metadata[dataset_type.value]
        filename_path, dir_data = qa_meta_data['local']
        print(filename_path, dir)
        data = read_json_file_2_dict(filename_path, dir_data)
        relations.extend(data)
    return relations

def KGHandler_to_str(kg_handler: KGBaseHandler) -> str:
    if isinstance(kg_handler, SwowHandler):
        return 'swow'
    elif isinstance(kg_handler, ConceptNetHandler):
        return 'conceptnet'
    else:
        raise NotImplementedError()

def get_kg_qa_data_metadata(kg_handler: KGBaseHandler) -> Tuple[str, str]:
    kg_qa_data_path = _load_data_paths_metadata()
    if isinstance(kg_handler, SwowHandler):
        swow = kg_qa_data_path["swow"]
        return swow
    elif isinstance(kg_handler, ConceptNetHandler):
        conceptnet = kg_qa_data_path["conceptnet"]
        return conceptnet
    else:
        raise NotImplementedError()
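A brief sketch of the loader above; it assumes the nltk WordNet data and the bundled KG files are available locally, and anything other than SWOW or ConceptNet raises NotImplementedError:

from kgs_binding.kg_qa_binding_utils import KGType, KGHandler_to_str, load_kg_handler

handler = load_kg_handler(KGType.CONCEPTNET)   # -> ConceptNetHandler; KGType.SWOW -> SwowHandler
print(KGHandler_to_str(handler))               # 'conceptnet'
print(len(handler.get_relation_types()))       # number of relation labels this KG exposes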
kgs_binding/parsing_utils.py
ADDED
@@ -0,0 +1,86 @@
#############################
# Imports
#############################

# Python modules
import re
import string

# Remote modules

# Local modules
from utils import (
    read_simple_text_file_2_vec
)

#############################
# Utils
#############################

class ParsingUtils:

    STOPWORDS = read_simple_text_file_2_vec('english_stopwords.txt', store_dir='kgs_binding')

    @staticmethod
    def remove_pontuation(text):
        text = re.sub(r"[^a-zA-Z]", " ", text)
        return text.translate(str.maketrans('', '', string.punctuation))

    @staticmethod
    def clear_common_words(index_with_words):
        return [(word, (s, e)) for (word, (s, e)) in index_with_words if word not in ParsingUtils.STOPWORDS]

    @staticmethod
    def is_word_a_relevant_one(ignore_common_words, word):
        if ignore_common_words:
            return word not in ParsingUtils.STOPWORDS
        else:
            return True

    @staticmethod
    def get_word_range_mapping(context, word_token):
        word_token_splitted = word_token.split(' ')
        if len(word_token_splitted) == 1:
            word_token_start = context.index(word_token)
            word_token_end = word_token_start + len(word_token) - 1  # inclusive end
        else:
            word_token_start = context.index(word_token_splitted[0])
            word_token_end = word_token_start + len(word_token) - 1  # inclusive end
        return word_token_start, word_token_end

    @staticmethod
    def n_grams(words_vector, n):
        grams = [words_vector[i:i + n] for i in range(len(words_vector) - n + 1)]
        print(grams)
        return [' '.join(x) for x in grams]

    @staticmethod
    def n_grams_with_idx(words_vector, n):
        grams = [words_vector[i:i + n] for i in range(len(words_vector) - n + 1)]
        return [(' '.join([pair[0] for pair in x]), (x[0][1], x[-1][1]+len(x[-1][0]))) for x in grams]

    @staticmethod
    def n_grams_context_producer_simple(context, n_gram=2):
        context_tokens = context.strip().split(' ')
        #context_tokens = [w for w in context_tokens if w not in STOPWORDS]
        n_grams_context = []
        for i in range(n_gram):
            n_gram_content = ParsingUtils.n_grams(context_tokens, n_gram-i)
            n_grams_context.append(n_gram_content)
        return n_grams_context

    @staticmethod
    def n_grams_n_words_extractor(context, n_gram=3):
        context_tokens = context.strip().split(' ')
        context_tokens_with_index_info = []
        word_idx = 0
        for word in context_tokens:
            context_tokens_with_index_info.append((word, word_idx))
            word_idx += len(word) + 1
        #context_tokens = [w for w in context_tokens if w not in STOPWORDS]
        n_grams_context = []
        for i in range(n_gram):
            n_gram_content = ParsingUtils.n_grams_with_idx(context_tokens_with_index_info, n_gram-i)
            n_grams_context.extend(n_gram_content)
        return n_grams_context
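To make the index bookkeeping concrete, a small sketch of what n_grams_n_words_extractor returns for a three-word context; the phrase is arbitrary and the offsets are worked out by hand from the code above:

from kgs_binding.parsing_utils import ParsingUtils  # needs kgs_binding/english_stopwords.txt on disk

spans = ParsingUtils.n_grams_n_words_extractor('boat water bird')
# each entry is (phrase, (start_char, end_char_exclusive)), longest n-grams first:
# [('boat water bird', (0, 15)), ('boat water', (0, 10)), ('water bird', (5, 15)),
#  ('boat', (0, 4)), ('water', (5, 10)), ('bird', (11, 15))]
print(spans)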
kgs_binding/relation_mapper_builder.py
ADDED
@@ -0,0 +1,164 @@
#############################
# Imports
#############################

# Python modules
from collections import deque
from collections import defaultdict
from typing import List, Dict, Optional
from ast import literal_eval
from random import sample

# Remote modules

# Local modules
from .kg_base_wrapper import KGBaseHandler
from .swow_handler import SwowHandler

from utils import (
    read_json_file_2_dict,
    Data_Type,
)
from .parsing_utils import ParsingUtils

#############################
# Constants
#############################

#############################
# Stuff
#############################

class RelationsMapperBuilder:
    def __init__(self, knowledge: KGBaseHandler,
                 filename: Optional[str] = None,
                 file_dir: Optional[str] = None,
                 datatype: Optional[Data_Type] = None,
                 tok_sep: str = '</s>',
                 use_extra_relations=True):
        self.tok_sep = tok_sep
        self.knowledge = knowledge
        self.swow_knowledge = SwowHandler()
        self.use_extra_relations = use_extra_relations
        if filename and file_dir and datatype:
            full_context = self.load_data(filename, file_dir)
            self.relevant_context = self.fetch_relevant_context_from_data(data=full_context, datatype=datatype)

    def load_data(self, filename='commongen_qa_final.json', store_dir='./'):
        data = read_json_file_2_dict(filename=filename, store_dir=store_dir)
        print('data[0]:', data[0])
        return data

    def fetch_relevant_context_from_data(self, data: List[Dict], datatype: Data_Type = Data_Type.COMMONGEN_QA):
        if datatype == Data_Type.COMMONGEN_QA:
            model_input = [data_unit.get('title').lower() for data_unit in data]
        elif datatype in [Data_Type.ELI5, Data_Type.STACK_EXCHANGE]:
            model_input = [data_unit.get('question').lower() for data_unit in data]
        elif datatype in [Data_Type.COMMONSENSE_QA]:
            #questions = [data_unit.get('question').lower() for data_unit in data]
            #model_input = datasets_parsing_utils.compose_commonsenseqa_data(data)
            model_input = [data_unit.get('input_data') for data_unit in data]
        elif datatype in [Data_Type.COMMONGEN]:
            #questions = [data_unit.get('input_data').lower() for data_unit in data]
            #model_input = datasets_parsing_utils.compose_commongen_data(data)
            model_input = [data_unit.get('input_data') for data_unit in data]
        else:
            model_input = []
        return model_input

    def get_kg_concepts_from_context(self, context=None, clear_common_wds=False):
        if not context:
            context = self.relevant_context
        context_words = []
        for q_id, question in enumerate(context):
            simple_question = ParsingUtils.remove_pontuation(question)
            n_grams = ParsingUtils.n_grams_n_words_extractor(simple_question)
            words = self.relevant_entities_extractor(n_grams)
            if clear_common_wds:
                words = ParsingUtils.clear_common_words(words)
            simple_words = [word[0] for word in words]
            context_words.append(simple_words)
        return context_words

    def obtain_concept_neighbours(self, context_concepts: List[str], n_neighbours=20):
        """
        Use swow to get connected concepts, but then refer back to conceptnet for rich relations
        """
        neighbours = []
        for concept in context_concepts:
            external_neighbour_concepts = self.swow_knowledge.get_related_concepts(concept)
            relevant_concepts = external_neighbour_concepts
            #local_neighbour_concepts = self.knowledge.get_related_concepts(concept)
            #relevant_concepts = [ext_concept for ext_concept in external_neighbour_concepts if ext_concept in local_neighbour_concepts]
            neighbours.extend(relevant_concepts)
        n_neighbours = min(n_neighbours, len(neighbours))
        some_neighbours = sample(neighbours, n_neighbours)
        #print('context_concepts:', context_concepts)
        #print('some_neighbours:', some_neighbours)
        return some_neighbours

    def get_relations_mapping_complex(self, context=None, clear_common_wds=False):
        if not context:
            context = self.relevant_context
        relations_info = deque()
        for q_id, question in enumerate(context):
            simple_question = ParsingUtils.remove_pontuation(question)
            n_grams = ParsingUtils.n_grams_n_words_extractor(simple_question)
            words = self.relevant_entities_extractor(n_grams)
            if clear_common_wds:
                words = ParsingUtils.clear_common_words(words)
            #print(f'question: {question}')
            #print(f'words: {words}')
            relation_context_between_words = defaultdict(dict)
            known_tokens = set()
            for token_i, (first_word_token, first_word_range) in enumerate(words[:-1]):
                known_tokens.add(first_word_token)
                first_word_range_str = str(first_word_range)
                # normalize
                first_word_phrase_normalized = self.knowledge.normalize_nouns(first_word_token)
                for (second_word_token, second_word_range) in [w for w in words[token_i + 1:] if w not in known_tokens]:
                    second_word_range_str = str(second_word_range)
                    second_word_phrase_normalized = self.knowledge.normalize_nouns(second_word_token)
                    left_2_right, right_2_left = self.knowledge.relation_between(first_word_phrase_normalized, second_word_phrase_normalized)
                    #print(first_word_token, second_word_token, left_2_right, right_2_left)
                    if left_2_right:
                        relation_context_between_words[first_word_range_str][second_word_range_str] = left_2_right
                    if right_2_left:
                        relation_context_between_words[second_word_range_str][first_word_range_str] = right_2_left
            relations_info.append(dict(relation_context_between_words))
        return list(relations_info)

    def get_concepts_from_context(self, context=None, clear_common_wds=False, alignment=0):
        relations_info = self.get_relations_mapping_complex(context=[context], clear_common_wds=clear_common_wds)
        words = []
        #print('relations_info here:', relations_info)
        for rels in relations_info:
            for coords, v in rels.items():
                coords_tuple = literal_eval(coords)
                i, j = coords_tuple
                words.append(context[i+alignment:j+alignment])
                for coords_other, rel in v.items():
                    coords_other_tuple = literal_eval(coords_other)
                    i_other, j_other = coords_other_tuple
                    words.append(context[i_other+alignment: j_other+alignment])
        returning_words = list(set(words))
        #print('returning_words:', returning_words)
        return returning_words

    def relevant_entities_extractor(self, n_grams_n_words, verbose_output=True):
        non_overlapping_knowledge = {}
        # print(n_grams_n_words)
        for concept, (idx_start, idx_end) in n_grams_n_words:
154 |
+
normalized_concept = self.knowledge.normalize_nouns(concept)
|
155 |
+
exists = self.knowledge.does_concept_exist(normalized_concept)
|
156 |
+
#print('exists: ', concept, normalized_concept, exists)
|
157 |
+
if exists and idx_start not in non_overlapping_knowledge and \
|
158 |
+
idx_end not in non_overlapping_knowledge:
|
159 |
+
non_overlapping_knowledge[idx_start] = (concept, idx_start, idx_end, 'start_idx')
|
160 |
+
non_overlapping_knowledge[idx_end] = (concept, idx_end, idx_end, 'end_idx')
|
161 |
+
if verbose_output:
|
162 |
+
return [(value[0], (value[1], value[2])) for k, value in sorted(non_overlapping_knowledge.items()) if value[-1] == 'start_idx']
|
163 |
+
else:
|
164 |
+
return [value[0] for k, value in sorted(non_overlapping_knowledge.items()) if value[-1] == 'start_idx']
|
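For orientation, RelationsMapperBuilder is the piece that turns raw question text into knowledge-graph concepts and pairwise relations. The sketch below is illustrative only and not part of this commit; it assumes the handler class exposed by kgs_binding/conceptnet_handler.py is named ConceptNetHandler and uses a made-up example sentence.

    from kgs_binding.conceptnet_handler import ConceptNetHandler  # assumed class name
    from kgs_binding.relation_mapper_builder import RelationsMapperBuilder

    # Build the mapper on top of a concrete KG handler (no dataset file needed for ad-hoc use)
    mapper = RelationsMapperBuilder(knowledge=ConceptNetHandler())

    sentence = 'a bird flies over the water near a boat'
    # Concepts from the sentence that exist in the KG
    concepts = mapper.get_concepts_from_context(context=sentence, clear_common_wds=True)
    # Character-range pairs mapped to the relations that link them
    relations = mapper.get_relations_mapping_complex(context=[sentence], clear_common_wds=True)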
kgs_binding/swow/__init__.py
ADDED
@@ -0,0 +1 @@
from . import *
kgs_binding/swow/swow_knowledge.json
ADDED
The diff for this file is too large to render.
See raw diff
kgs_binding/swow_handler.py
ADDED
@@ -0,0 +1,75 @@

#############################
# Imports
#############################

# Python modules
import random
from typing import Tuple, Optional, List

# Remote modules

# Local modules
from .kg_base_wrapper import KGBaseHandler


from utils import read_json_file_2_dict

#############################
# Constants
#############################

#############################
# Stuff
#############################

class SwowHandler(KGBaseHandler):
    def __init__(self, store_dir='kgs_binding/swow'):
        super(SwowHandler, self).__init__()
        self.swow: dict = self.load_stored_data(store_dir=store_dir)

    def get_relation_types(self) -> List[str]:
        return ['related_to']

    def load_stored_data(self, filename='swow_knowledge.json', store_dir='kgs_binding/swow'):
        self.swow = read_json_file_2_dict(filename, store_dir)
        return self.swow

    def exists_relation_between(self, concept, other_concept):
        connections = self.swow.get(concept)
        if not connections:
            return False
        for connection in connections:
            if connection == other_concept:
                return True
        return False

    def does_concept_exist(self, concept):
        return self.swow.get(concept, None) is not None

    def relation_between(self, concept, other_concept) -> Tuple[Optional[str], Optional[str]]:
        exists_left_right = self.exists_relation_between(concept, other_concept)
        exists_right_left = self.exists_relation_between(other_concept, concept)
        relation = None
        if exists_left_right or exists_right_left:
            relation = 'related_to'
        return relation, relation

    def get_related_concepts(self, concept) -> Optional[List[str]]:
        return self.swow.get(concept, [])

    def simple_knowledge_prediction(self, knowledge):
        kw = list(knowledge)
        idx = random.randint(0, len(knowledge)-1)  # 0-1-2
        kw[idx] = '<mask>'
        textual_knowledge_input = f'{kw[0]} {kw[1]} {kw[2]}'
        label = f'{knowledge[0]} {knowledge[1]} {knowledge[2]}'
        return f'{textual_knowledge_input},{label}\n', label

    def create_mask_knowledge_for_model(self):
        with open(f'bart_input/swow_bart.txt', 'w') as f:
            for subject, objects in self.swow.items():
                for obj in objects:
                    knowledge = (subject, 'is related to', obj)
                    w_kw, label = self.simple_knowledge_prediction(knowledge)
                    f.write(w_kw)
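SwowHandler wraps the SWOW (Small World of Words) association graph as a single dict of concept -> related concepts, with one untyped relation, 'related_to'. A quick usage sketch (illustrative only; whether the example concepts exist depends on the shipped swow_knowledge.json):

    from kgs_binding.swow_handler import SwowHandler

    swow = SwowHandler()                      # loads kgs_binding/swow/swow_knowledge.json
    swow.does_concept_exist('water')          # True only if 'water' is a SWOW node
    swow.relation_between('water', 'boat')    # ('related_to', 'related_to') or (None, None)
    swow.get_related_concepts('water')        # list of associated concepts, [] if unknown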
model_utils.py
ADDED
@@ -0,0 +1,54 @@

#############################
# Imports
#############################

# Python modules
from typing import List
from random import randint

# Remote modules
import torch

# Local modules
from utils import Head_Mask

#############################
# Constants
#############################

#############################
# Stuff
#############################

def create_layers_head_mask(config, head_mask_type: Head_Mask = Head_Mask.ALL, specific_heads: List[int] = None):
    mask_heads = torch.zeros((config.encoder_layers, config.encoder_attention_heads))
    if head_mask_type == Head_Mask.RANDOM:
        for i in range(config.encoder_layers):
            rand_idx = randint(0, config.encoder_attention_heads-1)
            mask_heads[i, rand_idx] = 1
    elif head_mask_type == Head_Mask.NONE:
        mask_heads[:, :] = 1
    elif head_mask_type == Head_Mask.ALL:
        pass
    elif head_mask_type == Head_Mask.SPECIFIC:
        if specific_heads:
            for layer_i in range(len(mask_heads)):
                specific_head = specific_heads[layer_i] - 1
                mask_heads[layer_i][specific_head] = 1
        else:
            mask_heads = torch.Tensor([[0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
                                       [1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0],
                                       [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0],
                                       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
                                       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
                                       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
                                       [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0],
                                       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
                                       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                                       [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1],
                                       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1],
                                       [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]])
    else:
        raise NotImplementedError()
    return mask_heads.tolist()
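create_layers_head_mask builds an (encoder_layers x encoder_attention_heads) 0/1 mask, returned as nested lists. A minimal sketch of how it might be called (not part of this commit; it assumes a standard BART config, which for bart-large has 12 encoder layers of 16 heads each):

    from transformers import BartConfig
    from model_utils import create_layers_head_mask
    from utils import Head_Mask

    config = BartConfig.from_pretrained('facebook/bart-large')
    # One randomly chosen active head per encoder layer
    random_mask = create_layers_head_mask(config, head_mask_type=Head_Mask.RANDOM)
    # Explicit per-layer head choice (1-indexed), one entry per encoder layer
    specific_mask = create_layers_head_mask(config, head_mask_type=Head_Mask.SPECIFIC,
                                            specific_heads=[1] * config.encoder_layers)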
requirements.txt
ADDED
@@ -0,0 +1,4 @@
transformers
torch
numpy
matplotlib
utils.py
ADDED
@@ -0,0 +1,230 @@
#############################
# Imports and Constants #
#############################

# Python modules
from enum import Enum
import os
import json
import time

# Remote packages
import torch

#############################
# utilities
#############################

class ScoringType(Enum):
    DEFAULT = 'default'
    MAX_PROB = 'max-prob'
    INTERPOL = 'interpol'
    CONSTRAINT = 'constraint'
    MULTIPLE_CHOICE = 'multiple_choice'

class LossType(Enum):
    DEFAULT = 'default'
    CP_RP_DEF = 'cp-rp-def'
    CP_DEF = 'cp-def'
    PRP_NRP_DEF = 'prp-nrp-def'

class Head_Mask(Enum):
    ALL = 'all'
    NONE = 'none'
    RANDOM = 'random'
    SPECIFIC = 'specific'

class KGType(Enum):
    SWOW = 'swow'
    CSKG = 'cskg'
    CONCEPTNET = 'conceptnet'

class Model_Type(Enum):
    RELATIONS = 'relations'
    MASK = 'mask'
    DEFAULT = 'default'

    def is_simple_mask_commonsense(self):
        return self == Model_Type.MASK

    def there_is_difference_between_relations(self):
        return self == Model_Type.RELATIONS

class Data_Type(Enum):
    ELI5 = 'eli5'
    COMMONSENSE_QA = 'commonsense_qa'
    COMMONGEN_QA = 'commongen_qa'
    STACK_EXCHANGE = 'stackexchange_qa'
    ASK_SCIENCE = 'ask_science_qa'
    NATURAL_QUESTIONS = 'natural_questions'
    LAMA = 'lama'
    CONCEPTNET = 'conceptnet'
    CUSTOM = 'custom'
    COMMONGEN = 'commongen'

    @staticmethod
    def data_types_to_str(data_types):
        datasets_str = '-'.join([x.value for x in data_types])
        return datasets_str

#############################
# Models
#############################

MODELS_PRETRAINING_NAME = {
    "bart_large": "facebook/bart-large",
    "bart_large_fp32": "patrickvonplaten/bart-large-fp32",
    "bart_large_tweak": "",
    "bart_base": "facebook/bart-base"
}

CURRENT_PRETRAINING_NAME = MODELS_PRETRAINING_NAME.get('bart_large_fp32')

#############################
# Files Management #
#############################

def create_directory(output_dir):
    # Create output directory if needed
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except FileExistsError as _:
            return
    else:
        print(f"Output directory {output_dir} already exists")

def read_simple_text_file_2_vec(filename, store_dir='.'):
    with open(f'{store_dir}/{filename}', 'r') as f:
        return f.read().split('\n')

def write_dict_2_json_file(json_object, filename, store_dir='.'):
    create_directory(store_dir)
    with open(f'{store_dir}/{filename}', 'w', encoding='utf-8') as file:
        json.dump(json_object, file, ensure_ascii=False, indent=4)


def read_json_file_2_dict(filename, store_dir='.'):
    with open(f'{store_dir}/{filename}', 'r', encoding='utf-8') as file:
        return json.load(file)

def read_jsonl_file_2_dict(filename, store_dir='.'):
    elements = []
    with open(f'{store_dir}/{filename}', 'r', encoding='utf-8') as file:
        for line in file:
            elements.append(json.loads(line))
    return elements

def read_txt_2_list(filename, store_dir='.'):
    with open(f'{store_dir}/{filename}', 'r', encoding='utf-8') as file:
        return file.read().split('\n')

#############################
# Data Structures helper functions
#############################

def get_chunks(lst, n):
    """Yield n successive chunks from lst."""
    jump = len(lst)//n
    for i in range(0, len(lst), jump):
        yield lst[i:i + jump]

def get_jump_chunks(lst, jump):
    """Yield successive jump-sized chunks from lst."""
    for i in range(0, len(lst), jump):
        yield lst[i:i + jump]

def join_str_first(sep_str, lis):
    return '{1}{0}'.format(sep_str.join(lis), sep_str).strip()

#############################
# Huggingface
#############################

def inputs_introspection_print(tokenizer, inputs):
    input_ids = inputs.get('input_ids', None)
    input_text = tokenizer.batch_decode(input_ids, skip_special_tokens=False)
    labels_ids = inputs.get('labels', None)
    labels_text = tokenizer.batch_decode(labels_ids, skip_special_tokens=False)
    print('original input:', input_text[:2])
    print("::::::::::::::::::::::::::")
    print('original labels:', labels_text[:2])
    print("==========|||||==========")

def tok_data_2_text(tokenizer, all_inputs):
    def clean_input_text(text):
        real_text = text.split(tokenizer.eos_token)[0]
        real_text = real_text.replace(tokenizer.bos_token, '').strip()
        return real_text
    all_input_text, all_labels_text = [], []
    for inputs in all_inputs:
        input_ids = inputs.get('input_ids', None)
        input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
        labels_ids = inputs.get('labels', None)
        labels_text = tokenizer.decode(labels_ids, skip_special_tokens=True)
        #print('input_text:', input_text)
        #print('labels_text:', labels_text)
        input_text = clean_input_text(input_text)
        all_input_text.append(input_text)
        all_labels_text.append(labels_text)
    return all_input_text, all_labels_text

#############################
# Torch
#############################

def get_device(verbose: bool = True):
    # If there's a GPU available...
    if torch.cuda.is_available():
        device = torch.device("cuda")
        n_gpus = torch.cuda.device_count()
        first_gpu = torch.cuda.get_device_name(0)
        if verbose:
            print(f'There are {n_gpus} GPU(s) available.')
            print(f'GPU to be used: {first_gpu}')
    else:
        if verbose:
            print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")
    return device

#############################
# Timing
#############################

def timing_decorator(func):
    def wrapper(*args, **kwargs):
        start = time.time()
        original_return_val = func(*args, **kwargs)
        end = time.time()
        print("time elapsed in ", func.__name__, ": ", end - start, sep='')
        return original_return_val

    return wrapper

#############################
# PRINTING UTILS
#############################

class LOGGER_COLORS:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    INFOCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

def print_info(logger, message):
    logger.info(f'{LOGGER_COLORS.INFOCYAN}[INFO]{LOGGER_COLORS.ENDC}: {message}')

def print_success(logger, message):
    logger.info(f'{LOGGER_COLORS.OKGREEN}[SUCCESS]{LOGGER_COLORS.ENDC}: {message}')

def print_warning(logger, message):
    logger.info(f'{LOGGER_COLORS.WARNING}[WARNING]{LOGGER_COLORS.ENDC}: {message}')

def print_fail(logger, message):
    logger.info(f'{LOGGER_COLORS.FAIL}[FAIL]{LOGGER_COLORS.ENDC}: {message}')
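These helpers are consumed throughout the repo; as a quick illustration of the file and timing utilities (illustrative only; the directory, file name, and decorated function are made up for this sketch):

    from utils import timing_decorator, write_dict_2_json_file, read_json_file_2_dict, get_device

    device = get_device(verbose=False)

    @timing_decorator
    def build_cache():
        # round-trip a small dict through the JSON helpers
        write_dict_2_json_file({'hello': 'world'}, 'cache.json', store_dir='./tmp')
        return read_json_file_2_dict('cache.json', store_dir='./tmp')

    cache = build_cache()  # prints "time elapsed in build_cache: ..."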