Upload folder using huggingface_hub
- README.md +126 -0
- __init__.py +0 -0
- config.json +25 -0
- configuration_ltgbert.py +34 -0
- modeling_ltgbert.py +640 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tokenizer.json +0 -0
- tokenizer_config.json +10 -0
README.md
ADDED
@@ -0,0 +1,126 @@
---
language:
- is
inference: false
tags:
- BERT
- HPLT
- encoder
license: apache-2.0
datasets:
- HPLT/HPLT2.0_cleaned
---

# HPLT Bert for Icelandic

<img src="https://hplt-project.org/_next/static/media/logo-hplt.d5e16ca5.svg" width=12.5%>

This is one of the encoder-only monolingual language models trained as the second release by the [HPLT project](https://hplt-project.org/).
It is a so-called masked language model. In particular, we used a modification of the classic BERT model named [LTG-BERT](https://aclanthology.org/2023.findings-eacl.146/).

Monolingual LTG-BERT models are trained for a number of languages in the [HPLT 2.0 data release](https://hplt-project.org/datasets/v2.0).

All the HPLT encoder-only models use the same hyper-parameters, roughly following the BERT-base setup (see the configuration check after the list):
- hidden size: 768
- attention heads: 12
- layers: 12
- vocabulary size: 32768

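As a quick check, these values can be read back from the released `config.json` through the registered auto classes. A minimal sketch; `trust_remote_code=True` is required because the configuration class is custom:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("HPLT/hplt_bert_base_isl-Latn", trust_remote_code=True)
print(config.hidden_size, config.num_attention_heads, config.num_hidden_layers, config.vocab_size)
# expected: 768 12 12 32768
```

The released `config.json` also stores values not listed above, such as `intermediate_size` (2560) and `position_bucket_size` (32).
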
Every model uses its own tokenizer trained on language-specific HPLT data.

[The training code](https://github.com/hplt-project/HPLT-WP4).

[The training statistics of all runs](https://api.wandb.ai/links/ltg/kduj7mjn)

## Example usage

This model currently needs a custom wrapper from `modeling_ltgbert.py`, so you should load it with `trust_remote_code=True`.

```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("HPLT/hplt_bert_base_isl-Latn")
model = AutoModelForMaskedLM.from_pretrained("HPLT/hplt_bert_base_isl-Latn", trust_remote_code=True)

mask_id = tokenizer.convert_tokens_to_ids("[MASK]")
input_text = tokenizer("It's a beautiful[MASK].", return_tensors="pt")
output_p = model(**input_text)
output_text = torch.where(input_text.input_ids == mask_id, output_p.logits.argmax(-1), input_text.input_ids)

# should output: '[CLS] It's a beautiful place.[SEP]'
print(tokenizer.decode(output_text[0].tolist()))
```

The following classes are currently implemented: `AutoModel`, `AutoModelForMaskedLM`, `AutoModelForSequenceClassification`, `AutoModelForTokenClassification`, `AutoModelForQuestionAnswering` and `AutoModelForMultipleChoice`.
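
The task-specific heads are randomly initialised on top of the pretrained encoder and are meant to be fine-tuned. A minimal sketch of loading two of them (the `num_labels` values are illustrative, not part of the released configuration):

```python
from transformers import AutoModelForSequenceClassification, AutoModelForTokenClassification

# Sequence-level head (e.g. sentiment or NLI); the head weights are newly initialised.
classifier = AutoModelForSequenceClassification.from_pretrained(
    "HPLT/hplt_bert_base_isl-Latn", num_labels=2, trust_remote_code=True
)

# Token-level head (e.g. NER or POS tagging).
tagger = AutoModelForTokenClassification.from_pretrained(
    "HPLT/hplt_bert_base_isl-Latn", num_labels=9, trust_remote_code=True
)
```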

## Intermediate checkpoints

We are releasing 10 intermediate checkpoints for each model, saved every 3125 training steps, in separate branches. The naming convention is `stepXXX`; for example, `step18750`.

You can load a specific model revision with `transformers` using the argument `revision`:
```python
model = AutoModelForMaskedLM.from_pretrained("HPLT/hplt_bert_base_isl-Latn", revision="step21875", trust_remote_code=True)
```

You can access all the revisions for the models with the following code:
```python
from huggingface_hub import list_repo_refs
out = list_repo_refs("HPLT/hplt_bert_base_isl-Latn")
print([b.name for b in out.branches])
```
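
Combining the two snippets above, one way to sweep over all released checkpoints (a sketch; every branch whose name starts with `step` is assumed to be a checkpoint):

```python
from huggingface_hub import list_repo_refs
from transformers import AutoModelForMaskedLM

refs = list_repo_refs("HPLT/hplt_bert_base_isl-Latn")
for branch in refs.branches:
    if not branch.name.startswith("step"):
        continue  # skip "main" and any other non-checkpoint branches
    model = AutoModelForMaskedLM.from_pretrained(
        "HPLT/hplt_bert_base_isl-Latn", revision=branch.name, trust_remote_code=True
    )
    # ... evaluate the checkpoint here ...
```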

## Cite us

```bibtex
@inproceedings{samuel-etal-2023-trained,
    title = "Trained on 100 million words and still in shape: {BERT} meets {B}ritish {N}ational {C}orpus",
    author = "Samuel, David and
      Kutuzov, Andrey and
      {\O}vrelid, Lilja and
      Velldal, Erik",
    editor = "Vlachos, Andreas and
      Augenstein, Isabelle",
    booktitle = "Findings of the Association for Computational Linguistics: EACL 2023",
    month = may,
    year = "2023",
    address = "Dubrovnik, Croatia",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.findings-eacl.146",
    doi = "10.18653/v1/2023.findings-eacl.146",
    pages = "1954--1974"
}
```

```bibtex
@inproceedings{de-gibert-etal-2024-new-massive,
    title = "A New Massive Multilingual Dataset for High-Performance Language Technologies",
    author = {de Gibert, Ona and
      Nail, Graeme and
      Arefyev, Nikolay and
      Ba{\~n}{\'o}n, Marta and
      van der Linde, Jelmer and
      Ji, Shaoxiong and
      Zaragoza-Bernabeu, Jaume and
      Aulamo, Mikko and
      Ram{\'\i}rez-S{\'a}nchez, Gema and
      Kutuzov, Andrey and
      Pyysalo, Sampo and
      Oepen, Stephan and
      Tiedemann, J{\"o}rg},
    editor = "Calzolari, Nicoletta and
      Kan, Min-Yen and
      Hoste, Veronique and
      Lenci, Alessandro and
      Sakti, Sakriani and
      Xue, Nianwen",
    booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
    month = may,
    year = "2024",
    address = "Torino, Italia",
    publisher = "ELRA and ICCL",
    url = "https://aclanthology.org/2024.lrec-main.100",
    pages = "1116--1128",
    abstract = "We present the HPLT (High Performance Language Technologies) language resources, a new massive multilingual dataset including both monolingual and bilingual corpora extracted from CommonCrawl and previously unused web crawls from the Internet Archive. We describe our methods for data acquisition, management and processing of large corpora, which rely on open-source software tools and high-performance computing. Our monolingual collection focuses on low- to medium-resourced languages and covers 75 languages and a total of {\mbox{$\approx$}} 5.6 trillion word tokens de-duplicated on the document level. Our English-centric parallel corpus is derived from its monolingual counterpart and covers 18 language pairs and more than 96 million aligned sentence pairs with roughly 1.4 billion English tokens. The HPLT language resources are one of the largest open text corpora ever released, providing a great resource for language modeling and machine translation training. We publicly release the corpora, the software, and the tools used in this work.",
}
```
__init__.py
ADDED
File without changes
config.json
ADDED
@@ -0,0 +1,25 @@
{
  "architectures": [
    "LtgbertForMaskedLM"
  ],
  "auto_map": {
    "AutoConfig": "configuration_ltgbert.LtgbertConfig",
    "AutoModel": "modeling_ltgbert.LtgbertModel",
    "AutoModelForMaskedLM": "modeling_ltgbert.LtgbertForMaskedLM",
    "AutoModelForSequenceClassification": "modeling_ltgbert.LtgbertForSequenceClassification",
    "AutoModelForTokenClassification": "modeling_ltgbert.LtgbertForTokenClassification",
    "AutoModelForQuestionAnswering": "modeling_ltgbert.LtgbertForQuestionAnswering",
    "AutoModelForMultipleChoice": "modeling_ltgbert.LtgbertForMultipleChoice"
  },
  "attention_probs_dropout_prob": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "intermediate_size": 2560,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "position_bucket_size": 32,
  "torch_dtype": "float32",
  "vocab_size": 32768
}
configuration_ltgbert.py
ADDED
@@ -0,0 +1,34 @@
from transformers.configuration_utils import PretrainedConfig


class LtgbertConfig(PretrainedConfig):
    """Configuration class to store the configuration of a `LtgbertModel`."""

    def __init__(
        self,
        vocab_size=32768,
        attention_probs_dropout_prob=0.1,
        hidden_dropout_prob=0.1,
        hidden_size=768,
        intermediate_size=2048,
        max_position_embeddings=512,
        position_bucket_size=32,
        num_attention_heads=12,
        num_hidden_layers=12,
        layer_norm_eps=1.0e-7,
        output_all_encoded_layers=True,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.output_all_encoded_layers = output_all_encoded_layers
        self.position_bucket_size = position_bucket_size
        self.layer_norm_eps = layer_norm_eps
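
For reference, the class above can be instantiated on its own; its defaults follow the BERT-base setup, while the released `config.json` overrides `intermediate_size` to 2560. A minimal sketch, assuming `configuration_ltgbert.py` is importable from the working directory:

```python
from configuration_ltgbert import LtgbertConfig

config = LtgbertConfig()           # class defaults
print(config.hidden_size)          # 768
print(config.intermediate_size)    # 2048 here; the checkpoint's config.json uses 2560
```
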
modeling_ltgbert.py
ADDED
@@ -0,0 +1,640 @@
import math
from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import checkpoint

from .configuration_ltgbert import LtgbertConfig
from transformers.modeling_utils import PreTrainedModel
from transformers.activations import gelu_new
from transformers.modeling_outputs import (
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
    BaseModelOutput
)
from transformers.pytorch_utils import softmax_backward_data


class Encoder(nn.Module):
    def __init__(self, config, activation_checkpointing=False):
        super().__init__()
        self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.num_hidden_layers)])

        for i, layer in enumerate(self.layers):
            layer.mlp.mlp[1].weight.data *= math.sqrt(1.0 / (2.0 * (1 + i)))
            layer.mlp.mlp[-2].weight.data *= math.sqrt(1.0 / (2.0 * (1 + i)))

        self.activation_checkpointing = activation_checkpointing

    def forward(self, hidden_states, attention_mask, relative_embedding):
        hidden_states, attention_probs = [hidden_states], []

        for layer in self.layers:
            if self.activation_checkpointing:
                hidden_state, attention_p = checkpoint.checkpoint(layer, hidden_states[-1], attention_mask, relative_embedding)
            else:
                hidden_state, attention_p = layer(hidden_states[-1], attention_mask, relative_embedding)

            hidden_states.append(hidden_state)
            attention_probs.append(attention_p)

        return hidden_states, attention_probs


class MaskClassifier(nn.Module):
    def __init__(self, config, subword_embedding):
        super().__init__()
        self.nonlinearity = nn.Sequential(
            nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False),
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.GELU(),
            nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False),
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(subword_embedding.size(1), subword_embedding.size(0))
        )

    def forward(self, x, masked_lm_labels=None):
        if masked_lm_labels is not None:
            x = torch.index_select(x.flatten(0, 1), 0, torch.nonzero(masked_lm_labels.flatten() != -100).squeeze())
        x = self.nonlinearity(x)
        return x


class EncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = Attention(config)
        self.mlp = FeedForward(config)

    def forward(self, x, padding_mask, relative_embedding):
        attention_output, attention_probs = self.attention(x, padding_mask, relative_embedding)
        x = x + attention_output
        x = x + self.mlp(x)
        return x, attention_probs


class GeGLU(nn.Module):
    def forward(self, x):
        x, gate = x.chunk(2, dim=-1)
        x = x * gelu_new(gate)
        return x


class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False),
            nn.Linear(config.hidden_size, 2*config.intermediate_size, bias=False),
            GeGLU(),
            nn.LayerNorm(config.intermediate_size, eps=config.layer_norm_eps, elementwise_affine=False),
            nn.Linear(config.intermediate_size, config.hidden_size, bias=False),
            nn.Dropout(config.hidden_dropout_prob)
        )

    def forward(self, x):
        return self.mlp(x)


class MaskedSoftmax(torch.autograd.Function):
    @staticmethod
    def forward(self, x, mask, dim):
        self.dim = dim
        x.masked_fill_(mask, float('-inf'))
        x = torch.softmax(x, self.dim)
        x.masked_fill_(mask, 0.0)
        self.save_for_backward(x)
        return x

    @staticmethod
    def backward(self, grad_output):
        output, = self.saved_tensors
        input_grad = softmax_backward_data(self, grad_output, output, self.dim, output)
        return input_grad, None, None


class Attention(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.config = config

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads {config.num_attention_heads}")

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_size = config.hidden_size // config.num_attention_heads

        self.in_proj_qk = nn.Linear(config.hidden_size, 2*config.hidden_size, bias=True)
        self.in_proj_v = nn.Linear(config.hidden_size, config.hidden_size, bias=True)
        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=True)

        self.pre_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False)
        self.post_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True)

        position_indices = torch.arange(config.max_position_embeddings, dtype=torch.long).unsqueeze(1) \
            - torch.arange(config.max_position_embeddings, dtype=torch.long).unsqueeze(0)
        position_indices = self.make_log_bucket_position(position_indices, config.position_bucket_size, config.max_position_embeddings)
        position_indices = config.position_bucket_size - 1 + position_indices
        self.register_buffer("position_indices", position_indices, persistent=True)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.scale = 1.0 / math.sqrt(3 * self.head_size)

    def make_log_bucket_position(self, relative_pos, bucket_size, max_position):
        sign = torch.sign(relative_pos)
        mid = bucket_size // 2
        abs_pos = torch.where((relative_pos < mid) & (relative_pos > -mid), mid - 1, torch.abs(relative_pos).clamp(max=max_position - 1))
        log_pos = torch.ceil(torch.log(abs_pos / mid) / math.log((max_position-1) / mid) * (mid - 1)).int() + mid
        bucket_pos = torch.where(abs_pos <= mid, relative_pos, log_pos * sign).long()
        return bucket_pos

    def compute_attention_scores(self, hidden_states, relative_embedding):
        key_len, batch_size, _ = hidden_states.size()
        query_len = key_len

        if self.position_indices.size(0) < query_len:
            position_indices = torch.arange(query_len, dtype=torch.long).unsqueeze(1) \
                - torch.arange(query_len, dtype=torch.long).unsqueeze(0)
            position_indices = self.make_log_bucket_position(position_indices, self.config.position_bucket_size, 512)
            position_indices = self.config.position_bucket_size - 1 + position_indices
            self.position_indices = position_indices.to(hidden_states.device)

        hidden_states = self.pre_layer_norm(hidden_states)

        query, key = self.in_proj_qk(hidden_states).chunk(2, dim=2)  # shape: [T, B, D]
        value = self.in_proj_v(hidden_states)  # shape: [T, B, D]

        query = query.reshape(query_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)
        key = key.reshape(key_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)
        value = value.view(key_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)

        attention_scores = torch.bmm(query, key.transpose(1, 2) * self.scale)

        query_pos, key_pos = self.in_proj_qk(self.dropout(relative_embedding)).chunk(2, dim=-1)  # shape: [2T-1, D]
        query_pos = query_pos.view(-1, self.num_heads, self.head_size)  # shape: [2T-1, H, D]
        key_pos = key_pos.view(-1, self.num_heads, self.head_size)  # shape: [2T-1, H, D]
        query = query.view(batch_size, self.num_heads, query_len, self.head_size)
        key = key.view(batch_size, self.num_heads, query_len, self.head_size)

        attention_c_p = torch.einsum("bhqd,khd->bhqk", query, key_pos.squeeze(1) * self.scale)
        attention_p_c = torch.einsum("bhkd,qhd->bhqk", key * self.scale, query_pos.squeeze(1))

        position_indices = self.position_indices[:query_len, :key_len].expand(batch_size, self.num_heads, -1, -1)
        attention_c_p = attention_c_p.gather(3, position_indices)
        attention_p_c = attention_p_c.gather(2, position_indices)

        attention_scores = attention_scores.view(batch_size, self.num_heads, query_len, key_len)
        attention_scores.add_(attention_c_p)
        attention_scores.add_(attention_p_c)

        return attention_scores, value

    def compute_output(self, attention_probs, value):
        attention_probs = self.dropout(attention_probs)
        context = torch.bmm(attention_probs.flatten(0, 1), value)  # shape: [B*H, Q, D]
        context = context.transpose(0, 1).reshape(context.size(1), -1, self.hidden_size)  # shape: [Q, B, H*D]
        context = self.out_proj(context)
        context = self.post_layer_norm(context)
        context = self.dropout(context)
        return context

    def forward(self, hidden_states, attention_mask, relative_embedding):
        attention_scores, value = self.compute_attention_scores(hidden_states, relative_embedding)
        attention_probs = MaskedSoftmax.apply(attention_scores, attention_mask, -1)
        return self.compute_output(attention_probs, value), attention_probs.detach()


class Embedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.word_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.word_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.relative_embedding = nn.Parameter(torch.empty(2 * config.position_bucket_size - 1, config.hidden_size))
        self.relative_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, input_ids):
        word_embedding = self.dropout(self.word_layer_norm(self.word_embedding(input_ids)))
        relative_embeddings = self.relative_layer_norm(self.relative_embedding)
        return word_embedding, relative_embeddings


#
# HuggingFace wrappers
#

class LtgbertPreTrainedModel(PreTrainedModel):
    config_class = LtgbertConfig
    supports_gradient_checkpointing = True

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, Encoder):
            module.activation_checkpointing = value

    def _init_weights(self, module):
        std = math.sqrt(2.0 / (5.0 * self.hidden_size))

        if isinstance(module, nn.Linear):
            nn.init.trunc_normal_(module.weight.data, mean=0.0, std=std, a=-2*std, b=2*std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            nn.init.trunc_normal_(module.weight.data, mean=0.0, std=std, a=-2*std, b=2*std)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


class LtgbertModel(LtgbertPreTrainedModel):
    def __init__(self, config, add_mlm_layer=False, gradient_checkpointing=False, **kwargs):
        super().__init__(config, **kwargs)
        self.config = config
        self.hidden_size = config.hidden_size

        self.embedding = Embedding(config)
        self.transformer = Encoder(config, activation_checkpointing=gradient_checkpointing)
        self.classifier = MaskClassifier(config, self.embedding.word_embedding.weight) if add_mlm_layer else None

    def get_input_embeddings(self):
        return self.embedding.word_embedding

    def set_input_embeddings(self, value):
        self.embedding.word_embedding = value

    def get_contextualized_embeddings(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None
    ) -> List[torch.Tensor]:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            raise ValueError("You have to specify input_ids")

        batch_size, seq_length = input_shape
        device = input_ids.device

        if attention_mask is None:
            attention_mask = torch.zeros(batch_size, seq_length, dtype=torch.bool, device=device)
        else:
            attention_mask = ~attention_mask.bool()
        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        static_embeddings, relative_embedding = self.embedding(input_ids.t())
        contextualized_embeddings, attention_probs = self.transformer(static_embeddings, attention_mask, relative_embedding)
        contextualized_embeddings = [e.transpose(0, 1) for e in contextualized_embeddings]
        last_layer = contextualized_embeddings[-1]
        contextualized_embeddings = [contextualized_embeddings[0]] + [
            contextualized_embeddings[i] - contextualized_embeddings[i - 1]
            for i in range(1, len(contextualized_embeddings))
        ]
        return last_layer, contextualized_embeddings, attention_probs

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs
    ) -> Union[Tuple[torch.Tensor], BaseModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)

        if not return_dict:
            return (
                sequence_output,
                *([contextualized_embeddings] if output_hidden_states else []),
                *([attention_probs] if output_attentions else [])
            )

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=contextualized_embeddings if output_hidden_states else None,
            attentions=attention_probs if output_attentions else None
        )


class LtgbertForMaskedLM(LtgbertModel):
    _keys_to_ignore_on_load_unexpected = ["head"]

    def __init__(self, config, **kwargs):
        super().__init__(config, add_mlm_layer=True, **kwargs)

    def get_output_embeddings(self):
        return self.classifier.nonlinearity[-1].weight

    def set_output_embeddings(self, new_embeddings):
        self.classifier.nonlinearity[-1].weight = new_embeddings

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
        subword_prediction = self.classifier(sequence_output)
        subword_prediction[:, :, :106+1] = float("-inf")

        masked_lm_loss = None
        if labels is not None:
            masked_lm_loss = F.cross_entropy(subword_prediction.flatten(0, 1), labels.flatten())

        if not return_dict:
            output = (
                subword_prediction,
                *([contextualized_embeddings] if output_hidden_states else []),
                *([attention_probs] if output_attentions else [])
            )
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=subword_prediction,
            hidden_states=contextualized_embeddings if output_hidden_states else None,
            attentions=attention_probs if output_attentions else None
        )


class Classifier(nn.Module):
    def __init__(self, config, num_labels: int):
        super().__init__()

        drop_out = getattr(config, "cls_dropout", None)
        drop_out = config.hidden_dropout_prob if drop_out is None else drop_out

        self.nonlinearity = nn.Sequential(
            nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False),
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.GELU(),
            nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False),
            nn.Dropout(drop_out),
            nn.Linear(config.hidden_size, num_labels)
        )

    def forward(self, x):
        x = self.nonlinearity(x)
        return x


class LtgbertForSequenceClassification(LtgbertModel):
    _keys_to_ignore_on_load_unexpected = ["classifier"]
    _keys_to_ignore_on_load_missing = ["head"]

    def __init__(self, config, **kwargs):
        super().__init__(config, add_mlm_layer=False, **kwargs)

        self.num_labels = config.num_labels
        self.head = Classifier(config, self.num_labels)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
        logits = self.head(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (
                logits,
                *([contextualized_embeddings] if output_hidden_states else []),
                *([attention_probs] if output_attentions else [])
            )
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=contextualized_embeddings if output_hidden_states else None,
            attentions=attention_probs if output_attentions else None
        )


class LtgbertForTokenClassification(LtgbertModel):
    _keys_to_ignore_on_load_unexpected = ["classifier"]
    _keys_to_ignore_on_load_missing = ["head"]

    def __init__(self, config, **kwargs):
        super().__init__(config, add_mlm_layer=False, **kwargs)

        self.num_labels = config.num_labels
        self.head = Classifier(config, self.num_labels)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
        logits = self.head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (
                logits,
                *([contextualized_embeddings] if output_hidden_states else []),
                *([attention_probs] if output_attentions else [])
            )
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=contextualized_embeddings if output_hidden_states else None,
            attentions=attention_probs if output_attentions else None
        )


class LtgbertForQuestionAnswering(LtgbertModel):
    _keys_to_ignore_on_load_unexpected = ["classifier"]
    _keys_to_ignore_on_load_missing = ["head"]

    def __init__(self, config, **kwargs):
        super().__init__(config, add_mlm_layer=False, **kwargs)

        self.num_labels = config.num_labels
        self.head = Classifier(config, self.num_labels)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        **kwargs
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
        logits = self.head(sequence_output)

        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)

            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (
                start_logits,
                end_logits,
                *([contextualized_embeddings] if output_hidden_states else []),
                *([attention_probs] if output_attentions else [])
            )
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=contextualized_embeddings if output_hidden_states else None,
            attentions=attention_probs if output_attentions else None
        )


class LtgbertForMultipleChoice(LtgbertModel):
    _keys_to_ignore_on_load_unexpected = ["classifier"]
    _keys_to_ignore_on_load_missing = ["head"]

    def __init__(self, config, **kwargs):
        super().__init__(config, add_mlm_layer=False, **kwargs)

        self.num_labels = getattr(config, "num_labels", 2)
        self.head = Classifier(config, self.num_labels)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1]

        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None

        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(flat_input_ids, flat_attention_mask)
        logits = self.head(sequence_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (
                reshaped_logits,
                *([contextualized_embeddings] if output_hidden_states else []),
                *([attention_probs] if output_attentions else [])
            )
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=contextualized_embeddings if output_hidden_states else None,
            attentions=attention_probs if output_attentions else None
        )
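
Note that the model takes the standard Hugging Face `attention_mask` (1 for real tokens, 0 for padding) and inverts it internally before the masked softmax. A minimal sketch of pooling `last_hidden_state` from the plain `AutoModel` wrapper into sentence embeddings (the example sentences and the mean pooling are illustrative, not part of the release):

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("HPLT/hplt_bert_base_isl-Latn")
model = AutoModel.from_pretrained("HPLT/hplt_bert_base_isl-Latn", trust_remote_code=True)

sentences = ["Halló heimur!", "Þetta er setning."]
batch = tokenizer(sentences, padding=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**batch)

# Mean-pool over real (non-padding) tokens to get one vector per sentence.
mask = batch.attention_mask.unsqueeze(-1).float()
embeddings = (outputs.last_hidden_state * mask).sum(1) / mask.sum(1)
print(embeddings.shape)  # (2, 768)
```
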
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f9181b8a738484af1e19cd6b71b2b2f5f4e47b92c115d0a34120f85807ce9081
size 525162806
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"bos_token": "[BOS]", "eos_token": "[EOS]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,10 @@
{
  "tokenizer_class": "PreTrainedTokenizerFast",
  "bos_token": "[BOS]",
  "eos_token": "[EOS]",
  "unk_token": "[UNK]",
  "sep_token": "[SEP]",
  "pad_token": "[PAD]",
  "cls_token": "[CLS]",
  "mask_token": "[MASK]"
}