Add model
- config.json +22 -0
- configuration.py +13 -0
- model.py +35 -0
- pipeline.py +116 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +7 -0
- summary.py +106 -0
- tokenizer_config.json +15 -0
- transformerutils.py +65 -0
- utilities.py +5 -0
- vocab.txt +0 -0
config.json
ADDED
@@ -0,0 +1,22 @@
+{
+  "architectures": [
+    "BERTSummarizer"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration.ExtSummConfig",
+    "AutoModel": "model.BERTSummarizer"
+  },
+  "custom_pipelines": {
+    "summarization": {
+      "impl": "pipeline.ExtSummPipeline",
+      "pt": [
+        "AutoModel"
+      ],
+      "tf": []
+    }
+  },
+  "input_size": 512,
+  "model_type": "pubmedbert-bio-ext-summ",
+  "torch_dtype": "float32",
+  "transformers_version": "4.30.2"
+}
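Usage note: the `auto_map` entries above are what let `transformers` resolve the custom classes shipped in this repository, which requires `trust_remote_code=True`. A minimal loading sketch; the repository id is a placeholder for wherever this model is hosted:

from transformers import AutoConfig

# "<namespace>/pubmedbert-bio-ext-summ" is a placeholder, not a confirmed Hub repo id.
config = AutoConfig.from_pretrained(
    "<namespace>/pubmedbert-bio-ext-summ",
    trust_remote_code=True,   # imports configuration.ExtSummConfig from this repo
)
print(config.model_type)      # "pubmedbert-bio-ext-summ"
print(config.input_size)      # 512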
configuration.py
ADDED
@@ -0,0 +1,13 @@
+from transformers import PretrainedConfig
+
+
+class ExtSummConfig(PretrainedConfig):
+    model_type = "pubmedbert-bio-ext-summ"
+
+    def __init__(
+        self,
+        input_size: int = 512,
+        **kwargs
+    ):
+        self.input_size = input_size
+        super().__init__(**kwargs)
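Usage note: `ExtSummConfig` only adds a single `input_size` field on top of `PretrainedConfig`; serialization and `from_pretrained`/`save_pretrained` are inherited. A small sketch, assuming configuration.py is importable from the working directory:

from configuration import ExtSummConfig   # local import; adjust the path as needed

config = ExtSummConfig(input_size=512)
print(config.model_type)                   # "pubmedbert-bio-ext-summ"
print("input_size" in config.to_dict())    # True: the custom field is serialized with the rest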
model.py
ADDED
@@ -0,0 +1,35 @@
+import torch
+from .transformerutils import TransformerInterEncoder
+from transformers import PreTrainedModel, AutoModel, BertConfig
+from .configuration import ExtSummConfig
+
+
+
+class BERTSummarizer(PreTrainedModel):
+    config_class = ExtSummConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.bert = AutoModel.from_config(BertConfig.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"))
+        self.input_size = config.input_size
+        self.encoder = TransformerInterEncoder(self.bert.config.hidden_size, max_len=512)
+
+    def forward(self, batch):
+        document_ids = batch["ids"].to(self.bert.device)
+        segments_ids = batch["segments_ids"].to(self.bert.device)
+        clss_mask = batch["clss_mask"].to(self.bert.device)
+        attn_mask = batch["attn_mask"].to(self.bert.device)
+
+        tokens_out, _ = self.bert(input_ids=document_ids, token_type_ids=segments_ids, attention_mask=attn_mask, return_dict=False)
+        out = []
+        logits_out = []
+
+        for i in range(len(tokens_out)):  # Batch handling
+            clss_out = tokens_out[i][clss_mask[i], :]
+            sentences_scores, logits = self.encoder(clss_out)
+            padding = torch.zeros(self.input_size - sentences_scores.shape[0]).to(sentences_scores.device)
+
+            out.append( torch.cat((sentences_scores, padding)) )
+            logits_out.append( torch.cat((logits, padding)) )
+
+        return torch.stack(out), torch.stack(logits_out)
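Usage note: `forward` expects a batch dict with four tensors of shape (batch, input_size) and returns sentence scores and logits padded back to input_size for each chunk. A shape-check sketch, loading the model through the `auto_map` entry (the repository id is again a placeholder, and the dummy batch only verifies tensor shapes):

import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "<namespace>/pubmedbert-bio-ext-summ",   # placeholder repo id
    trust_remote_code=True,                  # resolves model.BERTSummarizer via auto_map
)

size = model.config.input_size               # 512
batch = {
    "ids":          torch.zeros(1, size, dtype=torch.long),   # all [PAD]; shapes are what matter here
    "segments_ids": torch.zeros(1, size, dtype=torch.long),
    "clss_mask":    torch.zeros(1, size, dtype=torch.bool),
    "attn_mask":    torch.ones(1, size, dtype=torch.long),
}
batch["clss_mask"][0, 0] = True              # mark a single sentence position

scores, logits = model(batch)
print(scores.shape, logits.shape)            # torch.Size([1, 512]) torch.Size([1, 512])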
pipeline.py
ADDED
@@ -0,0 +1,116 @@
+from transformers import Pipeline
+import torch
+from .utilities import padToSize
+from .summary import select, splitDocument
+
+
+"""
+Generates the segments ids for BERT
+"""
+def generateSegmentIds(doc_ids, tokenizer):
+    # Alternating 0s and 1s
+    segments_ids = [0] * len(doc_ids)
+    curr_segment = 0
+
+    for i, token in enumerate(doc_ids):
+        segments_ids[i] = curr_segment
+        if token == tokenizer.vocab["[SEP]"]:
+            curr_segment = 1 - curr_segment
+
+    return segments_ids
+
+
+class ExtSummPipeline(Pipeline):
+    """
+    Extractive summarization pipeline
+
+    Inputs
+    ------
+    inputs : dict
+        'sentences' : list[str]
+            Sentences of the document
+
+    strategy : str
+        Strategy to summarize the document:
+        - 'length': summary with a maximum length (strategy_args is the maximum length).
+        - 'count': summary with the given number of sentences (strategy_args is the number of sentences).
+        - 'ratio': summary proportional to the length of the document (strategy_args is the ratio [0, 1]).
+        - 'threshold': summary only with sentences with a score higher than a given value (strategy_args is the minimum score).
+
+    strategy_args : any
+        Parameters of the strategy.
+
+    Outputs
+    -------
+    selected_sents : list[str]
+        List of the selected sentences
+
+    selected_idxs : list[int]
+        List of the indexes of the selected sentences in the original input
+    """
+
+
+    def _sanitize_parameters(self, **kwargs):
+        postprocess_kwargs = {}
+
+        if ("strategy" in kwargs and "strategy_args" not in kwargs) or ("strategy" not in kwargs and "strategy_args" in kwargs):
+            raise ValueError("`strategy` and `strategy_args` have to be both set")
+        if "strategy" in kwargs:
+            postprocess_kwargs["strategy"] = kwargs["strategy"]
+        if "strategy_args" in kwargs:
+            postprocess_kwargs["strategy_args"] = kwargs["strategy_args"]
+
+        return {}, {}, postprocess_kwargs
+
+
+    def preprocess(self, inputs):
+        sentences = inputs["sentences"]
+
+        # Tokenization and chunking
+        doc_tokens = self.tokenizer.tokenize( f"{self.tokenizer.sep_token}{self.tokenizer.cls_token}".join(sentences) )
+        doc_tokens = [self.tokenizer.cls_token] + doc_tokens + [self.tokenizer.sep_token]
+        doc_chunks = splitDocument(doc_tokens, self.tokenizer.cls_token, self.tokenizer.sep_token, self.model.config.input_size)
+
+        # Batch preparation
+        batch = {
+            "ids": [],
+            "segments_ids": [],
+            "clss_mask": [],
+            "attn_mask": [],
+        }
+        for chunk_tokens in doc_chunks:
+            doc_ids = self.tokenizer.convert_tokens_to_ids(chunk_tokens)
+            segment_ids = generateSegmentIds(doc_ids, self.tokenizer)
+            clss_mask = [True if token == self.tokenizer.cls_token_id else False for token in doc_ids]
+            attn_mask = [1 for _ in range(len(doc_ids))]
+
+            batch["ids"].append( padToSize(doc_ids, self.model.config.input_size, self.tokenizer.pad_token_id) )
+            batch["segments_ids"].append( padToSize(segment_ids, self.model.config.input_size, 0) )
+            batch["clss_mask"].append( padToSize(clss_mask, self.model.config.input_size, False) )
+            batch["attn_mask"].append( padToSize(attn_mask, self.model.config.input_size, 0) )
+
+        batch["ids"] = torch.as_tensor(batch["ids"])
+        batch["segments_ids"] = torch.as_tensor(batch["segments_ids"])
+        batch["clss_mask"] = torch.as_tensor(batch["clss_mask"])
+        batch["attn_mask"] = torch.as_tensor(batch["attn_mask"])
+        return { "inputs": batch, "sentences": sentences }
+
+
+    def _forward(self, args):
+        batch = args["inputs"]
+        sentences = args["sentences"]
+        out_predictions = torch.as_tensor([]).to(self.device)
+
+        self.model.eval()
+        with torch.no_grad():
+            batch_preds, _ = self.model(batch)
+            for i, clss_mask in enumerate(batch["clss_mask"]):
+                out_predictions = torch.cat((out_predictions, batch_preds[i][:torch.sum(clss_mask == True)]))
+
+        return { "predictions": out_predictions, "sentences": sentences }
+
+
+    def postprocess(self, args, strategy: str="count", strategy_args=3):
+        predictions = args["predictions"]
+        sentences = args["sentences"]
+        return select(sentences, predictions, strategy, strategy_args)
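Usage note: end to end, the custom pipeline registered in config.json takes the sentence list and the selection strategy as keyword arguments. A usage sketch (the repository id is a placeholder and the example sentences are made up):

from transformers import pipeline

summarizer = pipeline(
    "summarization",
    model="<namespace>/pubmedbert-bio-ext-summ",   # placeholder repo id
    trust_remote_code=True,                        # resolves pipeline.ExtSummPipeline
)

sentences = [
    "The study enrolled 120 patients with type 2 diabetes.",
    "Participants received either the drug or a placebo for 12 weeks.",
    "HbA1c decreased significantly in the treatment group.",
    "No serious adverse events were reported.",
]

# `strategy` and `strategy_args` must be passed together (see _sanitize_parameters).
selected_sents, selected_idxs = summarizer({"sentences": sentences},
                                           strategy="count", strategy_args=2)
print(selected_idxs)   # indexes of the two highest-scoring sentences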
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4564a38bbef76f25a17e7254197bddd1f0635e10b4674556f453f98bc1e38108
+size 483701601
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
summary.py
ADDED
@@ -0,0 +1,106 @@
+import torch
+
+
+def _selectStrategyLength(sentences, predictions, max_length):
+    selected_sents = []
+    sents_priority = torch.argsort(predictions, descending=True)
+    summary_len = 0
+    i = 0
+
+    while (summary_len < max_length) and (i < len(sents_priority)):
+        if summary_len + len(sentences[sents_priority[i]]) < max_length:
+            selected_sents.append(sents_priority[i].item())
+            summary_len += len(sentences[sents_priority[i]])
+        i += 1
+
+    return sorted(selected_sents)
+
+
+def _selectStrategyCount(sentences, predictions, num_sents):
+    selected_idxs = sorted(torch.topk(predictions, min(len(predictions), num_sents)).indices)
+    return [tensor.item() for tensor in selected_idxs]
+
+
+def _selectStrategyRatio(sentences, predictions, ratio):
+    doc_length = sum([ len(sent) for sent in sentences ])
+    return _selectStrategyLength(sentences, predictions, doc_length*ratio)
+
+
+def _selectStrategyThreshold(sentences, predictions, threshold):
+    return [i for i, score in enumerate(predictions) if score >= threshold]
+
+
+def select(sentences, predictions, strategy, strategy_args):
+    selected_sents = []
+
+    if strategy == "length":
+        selected_sents = _selectStrategyLength(sentences, predictions, strategy_args)
+    elif strategy == "count":
+        selected_sents = _selectStrategyCount(sentences, predictions, strategy_args)
+    elif strategy == "ratio":
+        selected_sents = _selectStrategyRatio(sentences, predictions, strategy_args)
+    elif strategy == "threshold":
+        selected_sents = _selectStrategyThreshold(sentences, predictions, strategy_args)
+    else:
+        raise NotImplementedError(f"Unknown strategy {strategy}")
+
+    return [sentences[i] for i in selected_sents], selected_sents
+
+
+
+"""
+Splits a document into chunks of at most a given size.
+
+Parameters
+----------
+doc_tokens : str[]
+    List of the tokens of the document.
+
+bos_token : str
+    Begin of sentence token.
+
+eos_token : str
+    End of sentence token.
+
+max_size : int
+    Maximum size of a chunk.
+Returns
+-------
+chunks : str[][]
+    Split document.
+"""
+def splitDocument(doc_tokens, bos_token, eos_token, max_size):
+    def _findNextBOSFrom(start_idx):
+        for i in range(start_idx, len(doc_tokens)):
+            if doc_tokens[i] == bos_token:
+                return i
+        return -1
+
+    def _findPreviousEOSFrom(start_idx):
+        for i in range(start_idx, -1, -1):
+            if doc_tokens[i] == eos_token:
+                return i
+        return -1
+
+    chunks = []
+
+    while len(doc_tokens) > max_size:
+        # Splits at the eos token
+        eos_idx = _findPreviousEOSFrom(max_size - 1)
+
+        if eos_idx == -1:
+            # The sentence is too long.
+            # Find the next bos after the current sentence (if it exists) and truncate the current sentence.
+            next_bos_idx = _findNextBOSFrom(max_size)
+            if next_bos_idx != -1:
+                doc_tokens = doc_tokens[:max_size-1] + [eos_token] + doc_tokens[next_bos_idx:]
+            else:
+                doc_tokens = doc_tokens[:max_size-1] + [eos_token]
+            eos_idx = max_size - 1
+
+        chunks.append(doc_tokens[:eos_idx+1])
+        doc_tokens = doc_tokens[eos_idx+1:]
+
+    if len(doc_tokens) > 0: chunks.append(doc_tokens)  # Remaining part of the document
+
+    return chunks
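Usage note: `select` is independent of the model; it just maps one score per sentence to a set of indexes according to the chosen strategy. A small sketch with hand-written scores, assuming summary.py is importable directly:

import torch
from summary import select   # local import; adjust the path as needed

sentences = ["First sentence.", "Second one.", "Third.", "Fourth sentence here."]
scores = torch.tensor([0.10, 0.80, 0.30, 0.95])

print(select(sentences, scores, "count", 2))        # the two best sentences, indexes [1, 3]
print(select(sentences, scores, "threshold", 0.5))  # every sentence scoring >= 0.5
print(select(sentences, scores, "ratio", 0.5))      # best sentences fitting ~half of the characters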
tokenizer_config.json
ADDED
@@ -0,0 +1,15 @@
+{
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
transformerutils.py
ADDED
@@ -0,0 +1,65 @@
+import torch
+import torch.nn as nn
+import math
+
+
+
+# Source: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, dropout=0.1, max_len=5000):
+        super().__init__()
+        self.dropout = nn.Dropout(p=dropout)
+
+        position = torch.arange(max_len).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
+        pe = torch.zeros(max_len, d_model)
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        self.register_buffer("pe", pe)
+
+    def forward(self, x):
+        x = x + self.pe[:x.size(0)]
+        return self.dropout(x)
+
+
+"""
+Same scheduler as in "Attention Is All You Need"
+"""
+class NoamScheduler():
+    def __init__(self, optimizer, warmup, model_size):
+        self.epoch = 0
+        self.optimizer = optimizer
+        self.warmup = warmup
+        self.model_size = model_size
+
+    def step(self):
+        self.epoch += 1
+        new_lr = self.model_size**(-0.5) * min(self.epoch**(-0.5), self.epoch * self.warmup**(-1.5))
+
+        for param in self.optimizer.param_groups:
+            param["lr"] = new_lr
+
+
+"""
+Encoder that attends over sentence-level features.
+"""
+class TransformerInterEncoder(nn.Module):
+    def __init__(self, d_model, d_ff=2048, nheads=8, num_encoders=2, dropout=0.1, max_len=512):
+        super().__init__()
+        self.positional_enc = PositionalEncoding(d_model, dropout, max_len)
+        self.encoders = nn.TransformerEncoder(
+            nn.TransformerEncoderLayer(d_model=d_model, nhead=nheads, dim_feedforward=d_ff),
+            num_layers=num_encoders
+        )
+        self.layer_norm = nn.LayerNorm(d_model)
+        self.linear = nn.Linear(d_model, 1)
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        x = self.positional_enc(x)
+        x = self.encoders(x)
+        x = self.layer_norm(x)
+        logit = self.linear(x)
+        sentences_scores = self.sigmoid(logit)
+
+        return sentences_scores.squeeze(-1), logit.squeeze(-1)
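Usage note: `TransformerInterEncoder` runs self-attention over the [CLS] vectors of a chunk and scores each sentence with a sigmoid head. A quick shape sketch with random features standing in for BERT's [CLS] outputs, assuming transformerutils.py is importable directly:

import torch
from transformerutils import TransformerInterEncoder   # local import; adjust the path as needed

encoder = TransformerInterEncoder(d_model=768, max_len=512)
clss_features = torch.randn(10, 768)   # 10 sentences, one 768-d [CLS] vector each

scores, logits = encoder(clss_features)
print(scores.shape)                    # torch.Size([10]); each score lies in (0, 1)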
utilities.py
ADDED
@@ -0,0 +1,5 @@
+"""
+Pads a list to a given size
+"""
+def padToSize(to_pad_list, pad_size, filler):
+    return to_pad_list + [filler]*(pad_size-len(to_pad_list))
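Usage note: a trivial example of the padding helper used throughout the pipeline preprocessing, assuming utilities.py is importable directly:

from utilities import padToSize   # local import; adjust the path as needed

print(padToSize([101, 2023, 102], 8, 0))   # [101, 2023, 102, 0, 0, 0, 0, 0]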
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff