Initial commit of the trained NER model with code

Browse files

Files changed (7) hide show

config.json +25 -0
model.safetensors +3 -0
models.py +128 -0
special_tokens_map.json +37 -0
tokenizer.json +0 -0
tokenizer_config.json +58 -0
vocab.txt +0 -0

config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "_name_or_path": "../experiments_final/model_dbmdz_bert_medium_historic_multilingual_cased_max_sequence_length_512_epochs_5_run_extended_suffix_baseline/checkpoint-450",
+  "architectures": [
+    "ExtendedMultitaskModelForTokenClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 512,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 8,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.0.dev0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 32000
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:03a807b124debff782406c816eacb7ced1f2e25b9a5198b27e1616a41faa0662
+size 193971960

models.py ADDED Viewed

	@@ -0,0 +1,128 @@

+from transformers.modeling_outputs import TokenClassifierOutput
+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel, AutoModel, AutoConfig
+from torch.nn import CrossEntropyLoss
+from typing import Optional, Tuple, Union
+import logging
+logger = logging.getLogger(__name__)
+class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
+    config_class = AutoConfig
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+    def __init__(self, config, num_token_labels_dict):
+        super().__init__(config)
+        self.num_token_labels_dict = num_token_labels_dict
+        self.config = config
+        # self.bert = AutoModel.from_config(config)
+        self.bert = AutoModel.from_pretrained(config.name_or_path, config=config)
+        if "classifier_dropout" not in config.__dict__:
+            classifier_dropout = 0.1
+        else:
+            classifier_dropout = (
+                config.classifier_dropout
+                if config.classifier_dropout is not None
+                else config.hidden_dropout_prob
+            )
+        self.dropout = nn.Dropout(classifier_dropout)
+        # Additional transformer layers
+        self.transformer_encoder = nn.TransformerEncoder(
+            nn.TransformerEncoderLayer(
+                d_model=config.hidden_size, nhead=config.num_attention_heads
+            ),
+            num_layers=2,
+        )
+        # For token classification, create a classifier for each task
+        self.token_classifiers = nn.ModuleDict(
+            {
+                task: nn.Linear(config.hidden_size, num_labels)
+                for task, num_labels in num_token_labels_dict.items()
+            }
+        )
+        # Initialize weights and apply final processing
+        self.post_init()
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        token_labels: Optional[dict] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
+        r"""
+        token_labels (`dict` of `torch.LongTensor` of shape `(batch_size, seq_length)`, *optional*):
+            Labels for computing the token classification loss. Keys should match the tasks.
+        """
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        bert_kwargs = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "token_type_ids": token_type_ids,
+            "position_ids": position_ids,
+            "head_mask": head_mask,
+            "inputs_embeds": inputs_embeds,
+            "output_attentions": output_attentions,
+            "output_hidden_states": output_hidden_states,
+            "return_dict": return_dict,
+        }
+        if any(
+            keyword in self.config.name_or_path.lower()
+            for keyword in ["llama", "deberta"]
+        ):
+            bert_kwargs.pop("token_type_ids")
+            bert_kwargs.pop("head_mask")
+        outputs = self.bert(**bert_kwargs)
+        # For token classification
+        token_output = outputs[0]
+        token_output = self.dropout(token_output)
+        # Pass through additional transformer layers
+        token_output = self.transformer_encoder(token_output.transpose(0, 1)).transpose(
+            0, 1
+        )
+        # Collect the logits and compute the loss for each task
+        task_logits = {}
+        total_loss = 0
+        for task, classifier in self.token_classifiers.items():
+            logits = classifier(token_output)
+            task_logits[task] = logits
+            if token_labels and task in token_labels:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(
+                    logits.view(-1, self.num_token_labels_dict[task]),
+                    token_labels[task].view(-1),
+                )
+                total_loss += loss
+        if not return_dict:
+            output = (task_logits,) + outputs[2:]
+            return ((total_loss,) + output) if total_loss != 0 else output
+        return TokenClassifierOutput(
+            loss=total_loss,
+            logits=task_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": false,
+  "mask_token": "[MASK]",
+  "max_len": 512,
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": false,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff