Randomtalks committed
Commit 0774a75 · 1 Parent(s): eef1745

First commit with DOME model


Signed-off-by: egor <egorbu@gmail.com>

README.md ADDED
@@ -0,0 +1,224 @@
# DOME wrapper for docstring intent classification
This wrapper allows you to:
* split docstrings into sentences
* convert them to the inputs required by DOME
* predict a class for each sentence of the docstring

## Model architecture
The architecture is based on https://github.com/ICSE-DOME/DOME.

## Usage
```python
docstring = "sentences of docstring"
dome = DOME("dome_location")
sentences, predictions = dome.predict(docstring)
```

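`predictions` is a list of class-name strings taken from `DOME_CLASS_NAMES` (defined in the code further below). A hypothetical way to inspect the result; the actual labels depend entirely on the pretrained weights:

```python
# Illustrative only: labels come from DOME_CLASS_NAMES and depend on the model weights.
for sentence, label in zip(sentences, predictions):
    print(f"{label:>15} | {sentence}")
```
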
## Dependencies
```
spacy
torch
transformers
```

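Note that `Docstring2Sentences` loads spaCy's `en_core_web_sm` pipeline, which is not installed together with the `spacy` package itself. A minimal sketch of a guard that fetches it on first use (assumes network access; the usual alternative is running `python -m spacy download en_core_web_sm` once):

```python
import spacy

try:
    spacy.load("en_core_web_sm")
except OSError:
    # The sentence-splitting model is missing; download it programmatically.
    from spacy.cli import download
    download("en_core_web_sm")
```
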
## Code of the model
````python
"""
The model is based on the replication package for the ICSE'23 paper "Developer-Intent Driven Code Comment Generation".
Initial solution: https://github.com/ICSE-DOME/DOME
The pipeline consists of several parts:
* split the docstring into sentences
* prepare input data for DOMEBertForClassification
* predict a class

How to use:
```python
docstring = "sentences of docstring"
dome = DOME("dome_location")
sentences, predictions = dome.predict(docstring)
```
"""
import re
from typing import Tuple, List

import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, RobertaConfig, RobertaModel

# Maximum number of tokens (including the special tokens) passed to the encoder.
MAX_LENGTH_BERT = 510


class DOME:
    """
    End-to-end pipeline for docstring classification:
    * split sentences
    * prepare inputs
    * classify
    """
    def __init__(self, pretrained_model: str):
        """
        :param pretrained_model: location of the pretrained model
        """
        self.model = DOMEBertForClassification.from_pretrained(pretrained_model)
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self.docstring2sentences = Docstring2Sentences()

    def predict(self, docstring: str) -> Tuple[List[str], List[str]]:
        """
        Predict DOME classes for each sentence in the docstring.
        :param docstring: docstring to process
        :return: tuple with the list of sentences and the list of predictions for each sentence.
        """
        sentences = self.docstring2sentences.docstring2sentences(docstring)
        predictions = [self.model.predict(*dome_preprocess(tokenizer=self.tokenizer, comment=sentence))
                       for sentence in sentences]
        return sentences, predictions


class DOMEBertForClassification(RobertaModel):
    """
    A custom classification model based on RobertaModel for intent classification.

    This model extends RobertaModel with additional linear layers that incorporate
    the comment length as an extra feature for classification.
    """

    DOME_CLASS_NAMES = ["what", "why", "how-to-use", "how-it-is-done", "property", "others"]

    def __init__(self, config: RobertaConfig):
        """
        Initialize the DOMEBertForClassification model.

        :param config: The configuration for the RobertaModel.
        """
        super().__init__(config)

        # The number of classes and layer sizes are fixed because the pretrained model must be loaded as-is.
        # DOME layers for intent classification:
        self.fc1 = nn.Linear(768 + 1, 768 // 3)
        self.fc2 = nn.Linear(768 // 3, 6)
        self.dropout = nn.Dropout(0.2)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None, comment_len: torch.Tensor = None) \
            -> torch.Tensor:
        """
        Forward pass for the DOMEBertForClassification model.

        :param input_ids: Tensor of token ids to be fed to the model.
        :param attention_mask: Mask to avoid performing attention on padding token indices. Always equal to 1 here.
        :param comment_len: Tensor representing the length of the comment. Equal to 1 if the comment has fewer than
            3 words, 0 otherwise.
        :return: The logits after passing through the model.
        """
        # Use the parent class's forward method to get the base outputs
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Extract the pooled output (the [CLS] hidden state passed through the pooler layer)
        pooled_output = outputs.pooler_output
        # DOME custom layers:
        comment_len = comment_len.view(-1, 1).float()  # Ensure comment_len is correctly shaped
        # DOME uses the comment length as an additional feature
        combined_input = torch.cat([pooled_output, comment_len], dim=-1)
        x = self.dropout(F.relu(self.fc1(self.dropout(combined_input))))
        logits = self.fc2(x)
        return logits

    def predict(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None, comment_len: torch.Tensor = None) \
            -> str:
        """
        Predict the class for a tokenized docstring.

        :param input_ids: Tensor of token ids to be fed to the model.
        :param attention_mask: Mask to avoid performing attention on padding token indices. Always equal to 1 here.
        :param comment_len: Tensor representing the length of the comment. Equal to 1 if the comment has fewer than
            3 words, 0 otherwise.
        :return: predicted class name
        """
        logits = self.forward(input_ids=input_ids, attention_mask=attention_mask, comment_len=comment_len)
        return self.DOME_CLASS_NAMES[int(torch.argmax(logits, 1))]


def dome_preprocess(tokenizer, comment: str) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    DOME preprocessor - returns all values required by "DOMEBertForClassification.forward".
    This function limits the maximum number of tokens so that the input fits into BERT.
    :param tokenizer: tokenizer to use.
    :param comment: text of the sentence from a docstring/comment to be classified by DOMEBertForClassification.
    :return: tuple with (input_ids, attention_mask, comment_len).
    """
    input_ids = tokenizer.convert_tokens_to_ids([tokenizer.cls_token] + tokenizer.tokenize(comment) +
                                                [tokenizer.sep_token])[:MAX_LENGTH_BERT]
    attention_mask = [1] * len(input_ids)
    if len(comment.strip().split()) < 3:
        comment_len = 1
    else:
        comment_len = 0
    return (torch.tensor(input_ids).unsqueeze(0), torch.tensor(attention_mask).unsqueeze(0),
            torch.tensor(comment_len).unsqueeze(0))


class Docstring2Sentences:
    """Helper class to split docstrings into sentences"""
    def __init__(self):
        self.spacy_nlp = spacy.load("en_core_web_sm")

    @staticmethod
    def split_docstring(docstring: str, delimiters: List[Tuple[str, str]]) -> List[str]:
        """
        Splits the docstring into separate text and code blocks, preserving the original formatting.

        :param docstring: The docstring to split.
        :param delimiters: A list of tuples, each containing the start and end delimiters of a code block.
        :return: A list of strings, each either a text block or a code block.
        """

        # Escape delimiter parts for regex and create a combined pattern
        escaped_delimiters = [tuple(map(re.escape, d)) for d in delimiters]
        combined_pattern = '|'.join([f'({start}.*?{end})' for start, end in escaped_delimiters])

        # Split using the combined pattern, preserving the delimiters
        parts = re.split(combined_pattern, docstring, flags=re.DOTALL)

        # Filter out empty strings and the None values produced by unmatched capture groups
        parts = [part for part in parts if part]

        return parts

    @staticmethod
    def is_only_spaces_and_newlines(string: str) -> bool:
        """
        Check if the given string contains only spaces and newlines.

        :param string: The string to check.
        :return: True if the string contains only whitespace, False otherwise.
        """
        return bool(re.match(r'^[\s\n]+$', string))

    def docstring2sentences(self, docstring: str) -> List[str]:
        """
        Splits a docstring into individual sentences, preserving code blocks.

        This method uses `split_docstring` to split the docstring into parts based on predefined code block
        delimiters. It then uses a spaCy NLP model to split the non-code text parts into sentences.
        Code blocks are kept intact as single elements.

        :param docstring: The docstring to be processed, which may contain both regular text and code blocks.
        :return: A list containing individual sentences and intact code blocks.
        """
        delimiters = [("@code", "@endcode"), (r"\code", r"\endcode")]
        parts = self.split_docstring(docstring=docstring, delimiters=delimiters)
        sentences = []
        for part in parts:
            if part[1:5] == "code" and part[-7:] == "endcode":
                # code block
                sentences.append(part)
            else:
                sentences.extend(sentence.text for sentence in self.spacy_nlp(part).sents)

        return [sentence for sentence in sentences if not self.is_only_spaces_and_newlines(sentence)]
````
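For context, a short end-to-end run of the pipeline above on a docstring that contains a Doxygen-style code block; the path is a placeholder for the directory with this repository's files, and the printed labels depend entirely on the released weights:

```python
# Hypothetical example; "dome_location" is a placeholder for the local model directory.
example_docstring = """Compute the arithmetic mean of a list of numbers.

@code
mean([1, 2, 3])
@endcode

Returns 0 for an empty list."""

dome = DOME("dome_location")
sentences, predictions = dome.predict(example_docstring)
for sentence, label in zip(sentences, predictions):
    print(repr(sentence), "->", label)
# The @code ... @endcode block is kept as a single element and classified as one unit.
```
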
config.json ADDED
@@ -0,0 +1,27 @@
{
  "architectures": [
    "DOMEBertForClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}
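The `architectures` field points at the custom `DOMEBertForClassification` class defined in the README above; since `transformers` does not ship that class, the model has to be instantiated through it explicitly. A minimal sketch of how this config maps onto the class (the path is a placeholder):

```python
from transformers import RobertaConfig

config = RobertaConfig.from_pretrained("dome_location")  # parses this config.json
model = DOMEBertForClassification(config)                # randomly initialised; use .from_pretrained for the released weights
assert config.hidden_size == 768                         # matches the 768-dimensional pooled output consumed by fc1
```
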
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f045aba0a263d2bc109af9476b5673fb666b2e716de91698a6b639505668cb19
size 499457001
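The weights are stored as a Git LFS object (about 499 MB). Assuming the checkpoint was produced by `save_pretrained` on the class above, its state dict should contain the custom `fc1`/`fc2` heads alongside the RoBERTa encoder weights; a small sketch to check, once the file has been pulled from LFS:

```python
import torch

# Inspect the checkpoint contents without instantiating the model.
state_dict = torch.load("pytorch_model.bin", map_location="cpu")
# Expected (assumption based on the README code): keys such as "fc1.weight", "fc1.bias", "fc2.weight", "fc2.bias".
print([key for key in state_dict if key.startswith("fc")])
```
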
special_tokens_map.json ADDED
@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
{"unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "special_tokens_map_file": "/home/egor/workdir/github/ICSE-DOME/DOME/src/comment_classifier/pretrained_codebert/special_tokens_map.json", "name_or_path": "/home/egor/workdir/github/ICSE-DOME/DOME/src/comment_classifier/pretrained_codebert", "tokenizer_class": "RobertaTokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff