ccmaymay committed · verified
Commit c034681 · 1 Parent(s): e0be3dc

Add files from rrivera1849/LUAR-MUD 858fcb1.

Files changed (9)
  1. README.md +71 -3
  2. config.json +16 -0
  3. config.py +18 -0
  4. merges.txt +0 -0
  5. model.py +219 -0
  6. special_tokens_map.json +51 -0
  7. tokenizer.json +0 -0
  8. tokenizer_config.json +58 -0
  9. vocab.json +0 -0
README.md CHANGED
@@ -1,3 +1,71 @@
- ---
- license: apache-2.0
- ---
+ ---
+ language:
+ - en
+ license: apache-2.0
+ ---
+
+ # LUAR-MUD development model forked from rrivera1849/LUAR-MUD
+
+ Author Style Representations using [LUAR](https://aclanthology.org/2021.emnlp-main.70.pdf).
+
+ The LUAR training and evaluation repository can be found [here](https://github.com/llnl/luar).
+
+ This model was trained on the Reddit Million User Dataset (MUD) found [here](https://aclanthology.org/2021.naacl-main.415.pdf).
+
+ ## Usage
+
+ ```python
+ from transformers import AutoModel, AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("rrivera1849/LUAR-MUD")
+ model = AutoModel.from_pretrained("rrivera1849/LUAR-MUD", trust_remote_code=True)
+
+ # we embed `episodes`, a collection of documents presumed to come from an author
+ # NOTE: make sure that `episode_length` is consistent across episodes
+ batch_size = 3
+ episode_length = 16
+ text = [
+     ["Foo"] * episode_length,
+     ["Bar"] * episode_length,
+     ["Zoo"] * episode_length,
+ ]
+ text = [j for i in text for j in i]
+ tokenized_text = tokenizer(
+     text,
+     max_length=32,
+     padding="max_length",
+     truncation=True,
+     return_tensors="pt"
+ )
+ # inputs size: (batch_size, episode_length, max_token_length)
+ tokenized_text["input_ids"] = tokenized_text["input_ids"].reshape(batch_size, episode_length, -1)
+ tokenized_text["attention_mask"] = tokenized_text["attention_mask"].reshape(batch_size, episode_length, -1)
+ print(tokenized_text["input_ids"].size())       # torch.Size([3, 16, 32])
+ print(tokenized_text["attention_mask"].size())  # torch.Size([3, 16, 32])
+
+ out = model(**tokenized_text)
+ print(out.size())  # torch.Size([3, 512])
+
+ # to get the Transformer attentions:
+ out, attentions = model(**tokenized_text, output_attentions=True)
+ print(attentions[0].size())  # torch.Size([48, 12, 32, 32])
+ ```
+
+ ## Citing & Authors
+
+ If you find this model helpful, feel free to cite our [publication](https://aclanthology.org/2021.emnlp-main.70.pdf).
+
+ ```
+ @inproceedings{uar-emnlp2021,
+   author = {Rafael A. Rivera Soto and Olivia Miano and Juanita Ordonez and Barry Chen and Aleem Khan and Marcus Bishop and Nicholas Andrews},
+   title = {Learning Universal Authorship Representations},
+   booktitle = {EMNLP},
+   year = {2021},
+ }
+ ```
+
+ ## License
+
+ LUAR is distributed under the terms of the Apache License (Version 2.0).
+
+ All new contributions must be made under the Apache-2.0 license.
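The 512-dimensional outputs above are author embeddings, so two authors can be compared directly with cosine similarity. Below is a minimal sketch building on the README example (the `tokenizer` and `model` objects and the reshape convention are assumed to be set up exactly as above; `embed_episode` is a hypothetical helper, not something shipped in this repo):

```python
import torch
import torch.nn.functional as F

def embed_episode(docs, episode_length=16, max_length=32):
    # docs: a list of `episode_length` texts written by one author
    tok = tokenizer(docs, max_length=max_length, padding="max_length",
                    truncation=True, return_tensors="pt")
    tok["input_ids"] = tok["input_ids"].reshape(1, episode_length, -1)
    tok["attention_mask"] = tok["attention_mask"].reshape(1, episode_length, -1)
    with torch.no_grad():
        return model(**tok)  # shape: (1, 512)

emb_a = embed_episode(["Foo"] * 16)
emb_b = embed_episode(["Bar"] * 16)
print(F.cosine_similarity(emb_a, emb_b).item())  # higher => more similar writing style
```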
config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "architectures": [
+     "LUAR"
+   ],
+   "auto_map": {
+     "AutoConfig": "config.LUARConfig",
+     "AutoModel": "model.LUAR"
+   },
+   "embedding_size": 512,
+   "k_bucket_size": 1024,
+   "model_type": "LUAR",
+   "q_bucket_size": 512,
+   "torch_dtype": "float32",
+   "transformers_version": "4.33.2",
+   "use_memory_efficient_attention": false
+ }
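The `auto_map` block is what lets `transformers` resolve this repository's custom classes: `AutoConfig` dispatches to `config.LUARConfig` and `AutoModel` to `model.LUAR`, which is why `trust_remote_code=True` is required when loading. A small sketch of inspecting the configuration, assuming the upstream repo id from the README (substitute this fork's id as appropriate):

```python
from transformers import AutoConfig

# trust_remote_code is needed because auto_map points at custom code in the repo
config = AutoConfig.from_pretrained("rrivera1849/LUAR-MUD", trust_remote_code=True)
print(config.model_type)                           # "LUAR"
print(config.embedding_size)                       # 512, the author-embedding size
print(config.q_bucket_size, config.k_bucket_size)  # chunk sizes for memory-efficient attention
```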
config.py ADDED
@@ -0,0 +1,18 @@
+
+ from transformers import PretrainedConfig
+
+ class LUARConfig(PretrainedConfig):
+     model_type = "LUAR"
+
+     def __init__(self,
+         embedding_size: int = 512,
+         use_memory_efficient_attention=False,
+         q_bucket_size=512,
+         k_bucket_size=1024,
+         **kwargs,
+     ):
+         self.embedding_size = embedding_size
+         self.use_memory_efficient_attention = use_memory_efficient_attention
+         self.q_bucket_size = q_bucket_size
+         self.k_bucket_size = k_bucket_size
+         super().__init__(**kwargs)
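Because `LUARConfig` subclasses `PretrainedConfig`, it can also be constructed and round-tripped locally like any other Hugging Face config. A short sketch, assuming you are working from a local clone of this repository (the `./luar-config` path is just an example):

```python
from config import LUARConfig  # the module added in this commit

cfg = LUARConfig(embedding_size=512, use_memory_efficient_attention=False)
cfg.save_pretrained("./luar-config")             # writes config.json to that directory
reloaded = LUARConfig.from_pretrained("./luar-config")
assert reloaded.embedding_size == 512
assert reloaded.model_type == "LUAR"
```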
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.py ADDED
@@ -0,0 +1,219 @@
+
+ import math
+ from functools import partial
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from einops import rearrange, reduce, repeat
+ from torch.utils.checkpoint import checkpoint
+ from transformers import AutoModel, PreTrainedModel
+
+ from .config import LUARConfig
+
+ # Adapted LucidRains impl. of Memory Efficient Attention
+ # https://github.com/lucidrains/memory-efficient-attention-pytorch
+
+ def exists(val):
+     return val is not None
+
+ def summarize_qkv_chunk(
+     q, k, v,
+     mask
+ ):
+     """Dot-Product Attention for a chunk of queries, keys, and values.
+     """
+     weight = torch.einsum('b h i d, b h j d -> b h i j', q, k)
+
+     if exists(mask):
+         # HuggingFace masks have to be added:
+         weight += mask
+
+     weight_max = weight.amax(dim = -1, keepdim = True).detach()
+     weight = weight - weight_max
+
+     exp_weight = weight.exp()
+     weighted_value = torch.einsum('b h i j, b h j d -> b h i d', exp_weight, v)
+
+     return exp_weight.sum(dim = -1), weighted_value, rearrange(weight_max, '... 1 -> ...')
+
+ checkpointed_summarize_qkv_chunk = partial(checkpoint, summarize_qkv_chunk)
+
+ def memory_efficient_attention(
+     q, k, v,
+     mask = None,
+     q_bucket_size = 512,
+     k_bucket_size = 1024,
+     eps = 1e-8
+ ):
+     scale = q.shape[-1] ** -0.5
+     q = q * scale
+
+     # function
+     needs_backwards = q.requires_grad or k.requires_grad or v.requires_grad
+     summarize_qkv_fn = checkpointed_summarize_qkv_chunk if needs_backwards else summarize_qkv_chunk
+
+     # chunk all the inputs
+     q_chunks = q.split(q_bucket_size, dim = -2)
+     k_chunks = k.split(k_bucket_size, dim = -2)
+     v_chunks = v.split(k_bucket_size, dim = -2)
+     mask_chunks = mask.split(k_bucket_size, dim = -1) if exists(mask) else ((None,) * len(k_chunks))
+
+     # loop through all chunks and accumulate
+     out = []
+     for q_index, q_chunk in enumerate(q_chunks):
+         exp_weights = []
+         weighted_values = []
+         weight_maxes = []
+
+         for k_index, (k_chunk, v_chunk, mask_chunk) in enumerate(zip(k_chunks, v_chunks, mask_chunks)):
+
+             exp_weight_chunk, weighted_value_chunk, weight_max_chunk = summarize_qkv_fn(
+                 q_chunk,
+                 k_chunk,
+                 v_chunk,
+                 mask_chunk,
+             )
+
+             exp_weights.append(exp_weight_chunk)
+             weighted_values.append(weighted_value_chunk)
+             weight_maxes.append(weight_max_chunk)
+
+         exp_weights = torch.stack(exp_weights, dim = -1)
+         weighted_values = torch.stack(weighted_values, dim = -1)
+         weight_maxes = torch.stack(weight_maxes, dim = -1)
+
+         global_max = weight_maxes.amax(dim = -1, keepdim = True)
+         renorm_factor = (weight_maxes - global_max).exp().detach()
+
+         exp_weights = exp_weights * renorm_factor
+         weighted_values = weighted_values * rearrange(renorm_factor, '... c -> ... 1 c')
+
+         all_values = weighted_values.sum(dim = -1)
+         all_weights = exp_weights.sum(dim = -1)
+
+         normalized_values = all_values / (rearrange(all_weights, '... -> ... 1') + eps)
+         out.append(normalized_values)
+
+     return torch.cat(out, dim=-2)
+
+ class SelfAttention(nn.Module):
+     """Implements Dot-Product Self-Attention as used in "Attention is all You Need".
+     """
+     def __init__(
+         self,
+         memory_efficient_attention=False,
+         q_bucket_size=512,
+         k_bucket_size=1024,
+     ):
+         super(SelfAttention, self).__init__()
+         self.use_memory_efficient_attention = memory_efficient_attention
+         self.q_bucket_size = q_bucket_size
+         self.k_bucket_size = k_bucket_size
+
+     def forward(self, k, q, v):
+
+         if self.use_memory_efficient_attention:
+             q, k, v = map(
+                 lambda t: rearrange(t, 'b n (h d) -> b h n d', h = 12),
+                 (q, k, v)
+             )
+
+             out = memory_efficient_attention(
+                 q, k, v,
+                 q_bucket_size=self.q_bucket_size,
+                 k_bucket_size=self.k_bucket_size
+             )
+             out = rearrange(out, 'b h n d -> b n (h d)')
+             return out
+         else:
+             d_k = q.size(-1)
+             scores = torch.matmul(k, q.transpose(-2, -1)) / math.sqrt(d_k)
+             p_attn = F.softmax(scores, dim=-1)
+             return torch.matmul(p_attn, v)
+
+ class LUAR(PreTrainedModel):
+     """Defines the LUAR model.
+     """
+     config_class = LUARConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.create_transformer()
+         self.attn_fn = SelfAttention(
+             config.use_memory_efficient_attention,
+             config.q_bucket_size,
+             config.k_bucket_size,
+         )
+         self.linear = nn.Linear(self.hidden_size, config.embedding_size)
+
+     def create_transformer(self):
+         """Creates the Transformer backbone.
+         """
+         self.transformer = AutoModel.from_pretrained("sentence-transformers/paraphrase-distilroberta-base-v1")
+         self.hidden_size = self.transformer.config.hidden_size
+         self.num_attention_heads = self.transformer.config.num_attention_heads
+         self.dim_head = self.hidden_size // self.num_attention_heads
+
+     def mean_pooling(self, token_embeddings, attention_mask):
+         """Mean Pooling as described in the SBERT paper.
+         """
+         input_mask_expanded = repeat(attention_mask, 'b l -> b l d', d=self.hidden_size).type(token_embeddings.type())
+         sum_embeddings = reduce(token_embeddings * input_mask_expanded, 'b l d -> b d', 'sum')
+         sum_mask = torch.clamp(reduce(input_mask_expanded, 'b l d -> b d', 'sum'), min=1e-9)
+         return sum_embeddings / sum_mask
+
+     def get_episode_embeddings(self, input_ids, attention_mask, output_attentions=False, document_batch_size=0):
+         """Computes the Author Embedding.
+         """
+         B, E, _ = attention_mask.shape
+
+         input_ids = rearrange(input_ids, 'b e l -> (b e) l')
+         attention_mask = rearrange(attention_mask, 'b e l -> (b e) l')
+
+         if document_batch_size > 0:
+             outputs = {"last_hidden_state": [], "attentions": []}
+             for i in range(0, len(input_ids), document_batch_size):
+                 out = self.transformer(
+                     input_ids=input_ids[i:i+document_batch_size],
+                     attention_mask=attention_mask[i:i+document_batch_size],
+                     return_dict=True,
+                     output_hidden_states=False,
+                     output_attentions=output_attentions,
+                 )
+                 outputs["last_hidden_state"].append(out["last_hidden_state"])
+                 if output_attentions:
+                     outputs["attentions"].append(out["attentions"])
+             outputs["last_hidden_state"] = torch.cat(outputs["last_hidden_state"], dim=0)
+             if output_attentions:
+                 outputs["attentions"] = tuple([torch.cat([x[i] for x in outputs["attentions"]], dim=0) for i in range(len(outputs["attentions"][0]))])
+         else:
+             outputs = self.transformer(
+                 input_ids=input_ids,
+                 attention_mask=attention_mask,
+                 return_dict=True,
+                 output_hidden_states=False,
+                 output_attentions=output_attentions,
+             )
+
+         # at this point, we're embedding individual "comments"
+         comment_embeddings = self.mean_pooling(outputs['last_hidden_state'], attention_mask)
+         comment_embeddings = rearrange(comment_embeddings, '(b e) l -> b e l', b=B, e=E)
+
+         # aggregate individual comments embeddings into episode embeddings
+         episode_embeddings = self.attn_fn(comment_embeddings, comment_embeddings, comment_embeddings)
+         episode_embeddings = reduce(episode_embeddings, 'b e l -> b l', 'max')
+
+         episode_embeddings = self.linear(episode_embeddings)
+
+         if output_attentions:
+             return episode_embeddings, outputs["attentions"]
+
+         return episode_embeddings
+
+     def forward(self, input_ids, attention_mask, output_attentions=False, document_batch_size=0):
+         """Calculates a fixed-length feature vector for a batch of episode samples.
+         """
+         output = self.get_episode_embeddings(input_ids, attention_mask, output_attentions, document_batch_size)
+
+         return output
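`forward` expects `input_ids` and `attention_mask` of shape `(batch, episode_length, tokens)` and returns one embedding per episode; `document_batch_size > 0` caps how many of the `batch * episode_length` documents pass through the `paraphrase-distilroberta-base-v1` backbone per pass, trading throughput for memory. A minimal shape check, assuming `model` is a loaded LUAR instance and using random token ids purely for illustration:

```python
import torch

B, E, L = 2, 16, 32                       # authors, docs per episode, tokens per doc
input_ids = torch.randint(0, 50000, (B, E, L))
attention_mask = torch.ones(B, E, L, dtype=torch.long)

with torch.no_grad():
    # run the backbone over at most 8 documents at a time
    embeddings = model(input_ids, attention_mask, document_batch_size=8)

print(embeddings.shape)  # torch.Size([2, 512])
```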
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50264": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "errors": "replace",
+   "full_tokenizer_file": null,
+   "mask_token": "<mask>",
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "unk_token": "<unk>"
+ }
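Together with `vocab.json`, `merges.txt`, and `tokenizer.json`, this configures a standard RoBERTa byte-level BPE tokenizer with a 512-token model maximum. A quick sketch of confirming the special-token wiring above (loading the tokenizer does not require `trust_remote_code`, since `RobertaTokenizer` is a stock class):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("rrivera1849/LUAR-MUD")
print(tokenizer.model_max_length)                                     # 512
print(tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token)  # <s> </s> <pad>
print(tokenizer("Foo")["input_ids"])                                  # ids wrapped in <s> ... </s>
```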
vocab.json ADDED
The diff for this file is too large to render. See raw diff