PeteBleackley commited on
Commit
901446a
·
verified ·
1 Parent(s): e488867

End of training

Browse files
ConSec.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections.abc import Generator, Iterable
2
+ from dataclasses import dataclass
3
+ from enum import StrEnum
4
+
5
+ from nltk.tokenize import TreebankWordDetokenizer
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from transformers import (
10
+ AutoConfig,
11
+ AutoModel,
12
+ BatchEncoding,
13
+ DebertaV2Model,
14
+ PreTrainedConfig,
15
+ PreTrainedModel,
16
+ PreTrainedTokenizer,
17
+ )
18
+ from transformers.modeling_outputs import TokenClassifierOutput
19
+
20
class ModelURI(StrEnum):
    """Hugging Face hub ids of the DeBERTa-v3 checkpoints usable as the backbone."""
    BASE = "microsoft/deberta-v3-base"
    LARGE = "microsoft/deberta-v3-large"
23
+
24
class ConSec(PreTrainedModel):
    """Word-sense disambiguation head over a DeBERTa-v3 backbone.

    The encoder input pairs a sentence — with the target span wrapped in
    [START]/[END] marker tokens — with a second text listing candidate
    glosses, each introduced by a [GLOSS] token.  ``forward`` pools the
    marked span into a single vector and scores it against a vector per
    candidate gloss.
    """

    def __init__(self, config: PreTrainedConfig):
        super().__init__(config)
        if config.init_basemodel:
            # First-time fine-tune: load the published base weights, then
            # widen the embedding matrix for the extra special tokens.
            self.BaseModel = AutoModel.from_pretrained(config.name_or_path,
                                                       device_map="auto",
                                                       dtype=torch.bfloat16)
            self.config.vocab_size += 2
            self.BaseModel.resize_token_embeddings(self.config.vocab_size)
        else:
            # Reload path: empty backbone; checkpoint weights arrive via the
            # from_pretrained/post_init machinery.
            self.BaseModel = DebertaV2Model(config)
        # Ensure a saved config never re-triggers the download branch.
        config.init_basemodel = False

        self.loss = nn.CrossEntropyLoss()
        self.post_init()

    @classmethod
    def from_base(cls, base_id: ModelURI):
        """Alternate constructor: start a fine-tune from a published base model."""
        config = AutoConfig.from_pretrained(base_id)
        config.init_basemodel = True
        return cls(config)

    def add_special_tokens(self, start: int, end: int, gloss: int):
        """Record the token ids of the [START], [END] and [GLOSS] markers."""
        self.config.start_token = start
        self.config.end_token = end
        self.config.gloss_token = gloss

    def forward(self,
                input_ids: torch.Tensor | None = None,
                attention_mask: torch.Tensor | None = None,
                token_type_ids: torch.Tensor | None = None,
                position_ids: torch.Tensor | None = None,
                inputs_embeds: torch.Tensor | None = None,
                labels: torch.Tensor | None = None,
                output_attentions: bool | None = None,
                output_hidden_states: bool | None = None,
                return_dict: bool | None = None,
                **kwargs) -> TokenClassifierOutput:
        """Score each candidate gloss against the [START]..[END] span.

        Returns a ``TokenClassifierOutput`` whose ``logits`` has one column
        per gloss selected by :meth:`gloss_vectors`; ``loss`` is
        cross-entropy when ``labels`` is given.
        """
        base_model_output = self.BaseModel(input_ids=input_ids,
                                           attention_mask=attention_mask,
                                           token_type_ids=token_type_ids,
                                           position_ids=position_ids,
                                           inputs_embeds=inputs_embeds,
                                           output_attentions=output_attentions,
                                           output_hidden_states=output_hidden_states,
                                           **kwargs)
        token_vectors = base_model_output.last_hidden_state
        # 0/1 mask over the tokens between each [START]/[END] pair
        # (inclusive); strict zip asserts the markers are balanced.
        selection = torch.zeros_like(input_ids, dtype=token_vectors.dtype)
        starts = (input_ids == self.config.start_token).nonzero()
        ends = (input_ids == self.config.end_token).nonzero()
        for startpos, endpos in zip(starts, ends, strict=True):
            selection[startpos[0], startpos[1] : endpos[1] + 1] = 1.0
        # Sum-pool the marked span into one vector per row.
        entity_vectors = torch.einsum("ijk,ij->ik", token_vectors, selection)
        gloss_vectors = self.gloss_vectors(
            input_ids, starts, position_ids, token_vectors
        )
        # Dot product of the span vector with each gloss vector.
        logits = torch.einsum("ij,ikj->ik", entity_vectors, gloss_vectors)

        return TokenClassifierOutput(
            logits=logits,
            loss=self.loss(logits, labels) if labels is not None else None,
            hidden_states=base_model_output.hidden_states if output_hidden_states else None,
            attentions=base_model_output.attentions if output_attentions else None,
        )

    def gloss_vectors(self, input_ids: torch.Tensor,
                      starts: torch.Tensor,
                      position_ids: torch.Tensor,
                      token_vectors: torch.Tensor) -> torch.Tensor:
        """Per row, collect the encoder vectors of the [GLOSS] tokens whose
        position id equals that row's [START] position, zero-padding the
        per-row lists to a common length.
        """
        with self.device:
            vectors = [token_vectors[i, ((position_ids[i] == position_ids[i, j])
                                         & (input_ids[i] == self.config.gloss_token))]
                       for (i, j) in starts]
            maxlen = max(vector.shape[0] for vector in vectors)
            # BUGFIX: pad in the vectors' own dtype.  The original
            # hard-coded bfloat16, which makes torch.cat fail whenever the
            # backbone runs in any other dtype.
            return torch.stack([torch.cat([vector,
                                           torch.zeros((maxlen - vector.shape[0],
                                                        vector.shape[1]),
                                                       dtype=vector.dtype)])
                                for vector in vectors])
100
+
101
def json_sequencer(sentence:list[dict])->Generator[tuple[list[str], list[str], int]]:
    """Yield one disambiguation task per chunk that carries candidates.

    Sites are visited from fewest to most candidates (stable for ties).
    Each yielded tuple is ``(words, candidates, span)`` where *words* is the
    whole sentence with the target chunk wrapped in [START]/[END] markers
    and *span* is the chunk's index in *sentence*.
    """
    sites = [i for i, chunk in enumerate(sentence) if "candidates" in chunk]
    for span in sorted(sites, key=lambda i: len(sentence[i]["candidates"])):
        before = [word for chunk in sentence[:span] for word in chunk["words"]]
        after = [word for chunk in sentence[span + 1:] for word in chunk["words"]]
        marked = ["[START]"] + list(sentence[span]["words"]) + ["[END]"]
        yield (before + marked + after,
               sentence[span]["candidates"],
               span)
117
+
118
def json_labeller(sentence, tags):
    """Write each predicted label back onto its sentence chunk, in place.

    *tags* is the tagger's output: dicts carrying "index" (chunk position)
    and "label" (chosen concept).  Returns the mutated *sentence*.
    """
    for tag in tags:
        index, label = tag["index"], tag["label"]
        sentence[index]["label"] = label
    return sentence
122
+
123
class ConSecTagger:
    """Greedy sentence tagger driving a ConSec model.

    Disambiguation sites are visited from fewest to most candidates (the
    order the sequencer yields them); each decided sense's gloss is fed
    back as extra context for the remaining sites.
    """

    def __init__(self, model,
                 tokenizer,
                 ontology,
                 sequencer=json_sequencer,
                 labeller=json_labeller):
        """
        model: a ConSec instance.
        tokenizer: the matching tokenizer, already extended with the
            [START]/[END]/[GLOSS] special tokens.
        ontology: iterable of synsets exposing .concept and .definition.
        sequencer / labeller: pluggable input/output adapters.
        """
        self.model = model
        self.tokenizer = tokenizer
        # BUGFIX: PreTrainedTokenizer has no get_added_tokens();
        # get_added_vocab() is the documented token -> id mapping of the
        # added special tokens.
        special_tokens = self.tokenizer.get_added_vocab()
        self.start_token = special_tokens["[START]"]
        self.gloss_token = special_tokens["[GLOSS]"]
        self.sequencer = sequencer
        self.detokenizer = TreebankWordDetokenizer()
        self.glosses = {synset.concept: synset.definition
                        for synset in ontology}
        self.label = labeller

    def __call__(self, sentence):
        already_tagged = []
        for (words, candidates, index) in self.sequencer(sentence):
            text = self.detokenizer.detokenize(words)
            # The leading '' makes join() emit "[GLOSS] " before the first
            # gloss too, so every gloss is introduced by a [GLOSS] token.
            glosses = ['']
            glosses.extend([self.glosses[candidate] for candidate in candidates])
            glosses.extend([self.glosses[previous["label"]] for previous in already_tagged])
            with self.model.device:
                tokens = self.tokenizer(text, "[GLOSS] ".join(glosses),
                                        return_tensors="pt")
                # BUGFIX: with return_tensors="pt", input_ids is
                # (1, seq_len); shape[0] is the batch size (always 1), not
                # the sequence length.
                length = tokens.input_ids.shape[1]
                positions = torch.arange(length)
                # BUGFIX: nonzero(as_tuple=True) on a 2-D mask returns
                # (row_indices, col_indices); [0] was the all-zero batch
                # index.  We need the column of the (single) [START] token.
                place = int((tokens.input_ids == self.start_token)
                            .nonzero(as_tuple=True)[1][0])
                wordpos = tokens.token_to_word(place + 1)
                # BUGFIX: likewise take the *column* indices of the [GLOSS]
                # tokens, as plain ints, before appending the end sentinel.
                gloss_positions = ((tokens.input_ids == self.gloss_token)
                                   .nonzero(as_tuple=True)[1].tolist())
                gloss_positions.append(length)
                n_candidates = len(candidates)
                # Re-number the gloss segments: candidate glosses get
                # position ids starting at the [START] column (so
                # ConSec.gloss_vectors can match them to the target span);
                # glosses of already-tagged words start at their word's
                # first token instead.
                for (i, position) in enumerate(gloss_positions[:-1]):
                    if i < n_candidates:
                        end = place + gloss_positions[i + 1] - position
                        positions[position:gloss_positions[i + 1]] = torch.arange(place, end)
                    else:
                        known = already_tagged[i - n_candidates]
                        start = tokens.word_to_tokens(known["place"]).start
                        end = start + gloss_positions[i + 1] - position
                        positions[position:gloss_positions[i + 1]] = torch.arange(start, end)
                # BUGFIX: ConSec.forward takes input_ids — a token_ids
                # kwarg would fall into **kwargs and leave input_ids None
                # (and BatchEncoding has no .token_ids).  position_ids is
                # given the same (1, seq_len) batch shape as the inputs.
                prediction = self.model(input_ids=tokens.input_ids,
                                        attention_mask=tokens.attention_mask,
                                        token_type_ids=tokens.token_type_ids,
                                        position_ids=positions.unsqueeze(0))
                label = candidates[int(prediction.logits.argmax())]
            already_tagged.append({"label": label,
                                   "place": wordpos,
                                   "index": index})
        return self.label(sentence, already_tagged)
176
+
DisamBertSingleSense.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections.abc import Generator, Iterable
2
+ from dataclasses import dataclass
3
+ from enum import StrEnum
4
+
5
+ import pprint
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from transformers import (
10
+ AutoConfig,
11
+ AutoModel,
12
+ BatchEncoding,
13
+ ModernBertModel,
14
+ PreTrainedConfig,
15
+ PreTrainedModel,
16
+ PreTrainedTokenizer,
17
+ )
18
+ from transformers.modeling_outputs import TokenClassifierOutput
19
+
20
+ BATCH_SIZE = 16
21
+
22
+
23
class ModelURI(StrEnum):
    """Hugging Face hub ids of the ModernBERT checkpoints usable as the backbone."""
    BASE = "answerdotai/ModernBERT-base"
    LARGE = "answerdotai/ModernBERT-large"
26
+
27
+
28
@dataclass(slots=True, frozen=True)
class LexicalExample:
    """One ontology entry: a sense identifier paired with its gloss text."""
    # Unique sense/concept identifier (used as a dict key elsewhere).
    concept: str
    # The gloss: a textual definition of the sense.
    definition: str
32
+
33
+
34
@dataclass(slots=True, frozen=True)
class PaddedBatch:
    """A padded batch of token ids with its matching attention mask.

    NOTE(review): not referenced anywhere in this module's visible code —
    possibly kept for external callers; confirm before removing.
    """
    input_ids: torch.Tensor
    attention_mask: torch.Tensor
38
+
39
+
40
class DisamBertSingleSense(PreTrainedModel):
    """Word-sense disambiguation model over a ModernBERT backbone.

    Unlike the ConSec variant in this repository, candidate glosses are
    encoded in a *separate* forward pass (one row per candidate) and then
    scored against a pooled vector of the [START]..[END] span.
    """

    def __init__(self, config: PreTrainedConfig):
        super().__init__(config)
        if config.init_basemodel:
            # First-time fine-tune: fetch the published base weights, then
            # widen the embedding matrix for the extra special tokens.
            self.BaseModel = AutoModel.from_pretrained(config.name_or_path,
                                                       device_map="auto")
            self.config.vocab_size += 2
            self.BaseModel.resize_token_embeddings(self.config.vocab_size)
        else:
            # Reload path: empty backbone, weights arrive from checkpoint.
            self.BaseModel = ModernBertModel(config)
        # Ensure a saved config never re-triggers the download branch.
        config.init_basemodel = False

        self.loss = nn.CrossEntropyLoss()
        self.post_init()

    @classmethod
    def from_base(cls, base_id: ModelURI):
        """Alternate constructor: start a fine-tune from a published base model."""
        config = AutoConfig.from_pretrained(base_id)
        config.init_basemodel = True
        return cls(config)

    def add_special_tokens(self, start: int, end: int):
        """Record the token ids of the [START] and [END] markers."""
        self.config.start_token = start
        self.config.end_token = end

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        candidate_tokens: torch.Tensor,
        candidate_attention_masks: torch.Tensor,
        candidate_mapping: torch.Tensor,
        labels: Iterable[int] | None = None,
        output_hidden_states: bool = False,
        output_attentions: bool = False,
    ) -> TokenClassifierOutput:
        """Score every candidate gloss against the marked span of each row.

        candidate_mapping maps each gloss row to the sentence-batch index it
        belongs to (see CandidateLabeller).  Returns a TokenClassifierOutput
        with one logit column per (padded) candidate; cross-entropy loss
        when labels is given.
        """
        base_model_output = self.BaseModel(
            input_ids,
            attention_mask,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
        )
        token_vectors = base_model_output.last_hidden_state
        # 0/1 mask over tokens between each [START]/[END] pair (inclusive);
        # strict zip asserts the markers are balanced per batch.
        selection = torch.zeros_like(input_ids, dtype=token_vectors.dtype)
        starts = (input_ids == self.config.start_token).nonzero()
        ends = (input_ids == self.config.end_token).nonzero()
        for startpos, endpos in zip(starts, ends, strict=True):
            selection[startpos[0], startpos[1] : endpos[1] + 1] = 1.0
        # Sum-pool the marked span into one vector per sentence.
        entity_vectors = torch.einsum("ijk,ij->ik", token_vectors, selection)
        gloss_vectors = self.gloss_vectors(
            candidate_tokens, candidate_attention_masks, candidate_mapping
        )
        # Dot product of each span vector with its candidates' vectors.
        logits = torch.einsum("ij,ikj->ik", entity_vectors, gloss_vectors)

        return TokenClassifierOutput(
            logits=logits,
            loss=self.loss(logits, labels) if labels is not None else None,
            hidden_states=base_model_output.hidden_states if output_hidden_states else None,
            attentions=base_model_output.attentions if output_attentions else None,
        )

    def gloss_vectors(self, candidates, candidate_attention_masks, candidate_mapping):
        """Encode every gloss row and regroup by sentence, zero-padding each
        sentence's candidate list to the batch-wide maximum.

        Uses the [CLS]-position vector ([:, 0]) as the gloss embedding.
        NOTE(review): the zero padding uses torch.zeros' default dtype
        (float32) — this will fail to concatenate if the backbone is ever
        run in reduced precision; confirm intended dtype.
        """
        with self.device:
            vectors = self.BaseModel(candidates, candidate_attention_masks).last_hidden_state[:, 0]
            chunks = [
                torch.squeeze(vectors[(candidate_mapping == sentence_index).nonzero()], dim=1)
                for sentence_index in torch.unique(candidate_mapping)
            ]
            maxlen = max(chunk.shape[0] for chunk in chunks)
            return torch.stack(
                [
                    torch.cat(
                        [chunk, torch.zeros((maxlen - chunk.shape[0], self.config.hidden_size))]
                    )
                    for chunk in chunks
                ]
            )
117
+
118
+
119
class CandidateLabeller:
    """Data collator: pads a batch of pre-tokenised examples and attaches
    the tokenised glosses of every candidate sense.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        ontology: Generator[LexicalExample],
        device: torch.device,
        retain_candidates: bool = False,
    ):
        # Pre-tokenise every gloss exactly once, keyed by concept id, so
        # __call__ only has to pad.
        self.tokenizer = tokenizer
        self.device = device
        self.gloss_tokens = {
            example.concept: self.tokenizer(example.definition, padding=True)
            for example in ontology
        }
        self.retain_candidates = retain_candidates

    # NOTE(review): annotation corrected from ``dict`` — the body iterates
    # the argument and indexes batch[0], so it is a list of example dicts,
    # each presumably holding "input_ids", "attention_mask", "candidates"
    # and optionally "label"; verify against the dataset pipeline.
    def __call__(self, batch: list[dict]) -> dict:
        """Collate one batch into padded tensors for DisamBertSingleSense.

        Returns input_ids / attention_mask for the sentences, the stacked
        candidate gloss tokens + masks, a candidate_mapping tensor tying
        each gloss row to its sentence index, labels (as the index of the
        gold concept inside its candidate list) when present, and — when
        retain_candidates — the raw candidate lists.
        """
        with self.device:
            encoded = [
                BatchEncoding(
                    {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]}
                )
                for example in batch
            ]
            tokens = self.tokenizer.pad(encoded, padding=True, return_tensors="pt")
            # One row per candidate of every example, in batch order.
            candidate_tokens = self.tokenizer.pad(
                [
                    self.gloss_tokens[concept]
                    for example in batch
                    for concept in example["candidates"]
                ],
                padding=True,
                return_attention_mask=True,
                return_tensors="pt",
            )
            result = {
                "input_ids": tokens.input_ids,
                "attention_mask": tokens.attention_mask,
                "candidate_tokens": candidate_tokens.input_ids,
                "candidate_attention_masks": candidate_tokens.attention_mask,
                # Sentence index repeated once per candidate of that sentence.
                "candidate_mapping": torch.cat(
                    [
                        torch.tensor([i] * len(example["candidates"]))
                        for (i, example) in enumerate(batch)
                    ]
                ),
            }
            if "label" in batch[0]:
                # Gold label = position of the gold concept in its own
                # candidate list (assumes every example in the batch is
                # labelled when the first one is).
                result["labels"] = torch.tensor(
                    [example["candidates"].index(example["label"]) for example in batch]
                )
            if self.retain_candidates:
                result["candidates"] = [example["candidates"] for example in batch]
            return result
README.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ language:
4
+ - en
5
+ license: mit
6
+ base_model: microsoft/deberta-v3-base
7
+ tags:
8
+ - generated_from_trainer
9
+ metrics:
10
+ - precision
11
+ - recall
12
+ - f1
13
+ model-index:
14
+ - name: ConSec
15
+ results: []
16
+ ---
17
+
18
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
19
+ should probably proofread and complete it, then remove this comment. -->
20
+
21
+ # ConSec
22
+
23
+ This model is a fine-tuned version of [microsoft/deberta-v3-base](https://huggingface.co/microsoft/deberta-v3-base) on an unspecified dataset.
24
+ It achieves the following results on the evaluation set:
25
+ - Loss: 3.9109
26
+ - Precision: 0.5353
27
+ - Recall: 0.5517
28
+ - F1: 0.5434
29
+ - Matthews: 0.5509
30
+
31
+ ## Model description
32
+
33
+ More information needed
34
+
35
+ ## Intended uses & limitations
36
+
37
+ More information needed
38
+
39
+ ## Training and evaluation data
40
+
41
+ More information needed
42
+
43
+ ## Training procedure
44
+
45
+ ### Training hyperparameters
46
+
47
+ The following hyperparameters were used during training:
48
+ - learning_rate: 0.0001
49
+ - train_batch_size: 4
50
+ - eval_batch_size: 4
51
+ - seed: 42
52
+ - optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
53
+ - lr_scheduler_type: inverse_sqrt
54
+ - lr_scheduler_warmup_steps: 1000
55
+ - num_epochs: 5
56
+
57
+ ### Training results
58
+
59
+ | Training Loss | Epoch | Step | Validation Loss | Precision | Recall | F1 | Matthews |
60
+ |:-------------:|:-----:|:------:|:---------------:|:---------:|:------:|:------:|:--------:|
61
+ | No log | 0 | 0 | 365.7523 | 0.4494 | 0.3098 | 0.3668 | 0.3092 |
62
+ | 4.1345 | 1.0 | 56179 | 3.8725 | 0.5383 | 0.5535 | 0.5458 | 0.5527 |
63
+ | 4.1174 | 2.0 | 112358 | 3.9544 | 0.5360 | 0.5517 | 0.5437 | 0.5509 |
64
+ | 3.9667 | 3.0 | 168537 | 3.9244 | 0.5348 | 0.5517 | 0.5431 | 0.5509 |
65
+ | 4.2556 | 4.0 | 224716 | 3.9431 | 0.5348 | 0.5508 | 0.5427 | 0.5501 |
66
+ | 4.2113 | 5.0 | 280895 | 3.9109 | 0.5353 | 0.5517 | 0.5434 | 0.5509 |
67
+
68
+
69
+ ### Framework versions
70
+
71
+ - Transformers 5.3.0
72
+ - Pytorch 2.10.0+cu128
73
+ - Datasets 4.5.0
74
+ - Tokenizers 0.22.2
config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ConSec"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "auto_map": {
7
+ "AutoModel": "ConSec.ConSec"
8
+ },
9
+ "bos_token_id": 1,
10
+ "dtype": "bfloat16",
11
+ "end_token": 128002,
12
+ "eos_token_id": 2,
13
+ "gloss_token": 128003,
14
+ "hidden_act": "gelu",
15
+ "hidden_dropout_prob": 0.1,
16
+ "hidden_size": 768,
17
+ "init_basemodel": false,
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "layer_norm_eps": 1e-07,
21
+ "legacy": true,
22
+ "max_position_embeddings": 512,
23
+ "max_relative_positions": -1,
24
+ "model_type": "deberta-v2",
25
+ "norm_rel_ebd": "layer_norm",
26
+ "num_attention_heads": 12,
27
+ "num_hidden_layers": 12,
28
+ "pad_token_id": 0,
29
+ "pooler_dropout": 0,
30
+ "pooler_hidden_act": "gelu",
31
+ "pooler_hidden_size": 768,
32
+ "pos_att_type": [
33
+ "p2c",
34
+ "c2p"
35
+ ],
36
+ "position_biased_input": false,
37
+ "position_buckets": 256,
38
+ "relative_attention": true,
39
+ "share_att_key": true,
40
+ "start_token": 128001,
41
+ "tie_word_embeddings": true,
42
+ "transformers_version": "5.3.0",
43
+ "type_vocab_size": 0,
44
+ "use_cache": false,
45
+ "vocab_size": 128102
46
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51944abe32d6d50692f53f7ff5146b58b23aa7e5a015d85ea6b5c3cc14a5fb54
3
+ size 367690992
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "backend": "tokenizers",
4
+ "bos_token": "[CLS]",
5
+ "cls_token": "[CLS]",
6
+ "do_lower_case": false,
7
+ "eos_token": "[SEP]",
8
+ "extra_special_tokens": [
9
+ "[START]",
10
+ "[END]",
11
+ "[GLOSS]"
12
+ ],
13
+ "is_local": false,
14
+ "mask_token": "[MASK]",
15
+ "model_max_length": 1000000000000000019884624838656,
16
+ "pad_token": "[PAD]",
17
+ "sep_token": "[SEP]",
18
+ "split_by_punct": false,
19
+ "tokenizer_class": "DebertaV2Tokenizer",
20
+ "unk_id": 3,
21
+ "unk_token": "[UNK]",
22
+ "vocab_type": "spm"
23
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ac6f94a73fd655333f073ab7b1deec6fa994eee153b84935c33573c818f37a0
3
+ size 5265