fix tokenizer post-processor, clean rope buffer code
- README.md +30 -3
- modeling_mybert.py +0 -4
- tokenizer.json +1 -50
README.md
CHANGED
@@ -1,3 +1,30 @@
----
-
-
+---
+library_name: transformers
+pipeline_tag: fill-mask
+---
+
+# MyBERT (RoPE + Pre-LN, ~21M params)
+
+Custom BERT-style encoder trained with MLM on packed BookCorpus.
+`trust_remote_code=True` is required because the model uses custom modeling code (RoPE + Pre-LN).
+
+```python
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+import torch, torch.nn.functional as F
+
+tok = AutoTokenizer.from_pretrained("USERNAME/REPO")
+mdl = AutoModelForMaskedLM.from_pretrained("USERNAME/REPO", trust_remote_code=True).eval()
+
+text = f"the capital of france is {tok.mask_token}."
+enc = tok(text, return_tensors="pt")
+with torch.no_grad():
+    logits = mdl(**enc).logits
+mask_pos = (enc["input_ids"][0] == tok.mask_token_id).nonzero()[0, 0]
+probs = F.softmax(logits[0, mask_pos], dim=-1)
+for p, i in zip(*[t.tolist() for t in probs.topk(5)]):
+    print(f"{p:.4f} {tok.decode([i])!r}")
+```
+
+> **Note:** This is a small model trained with limited compute. It does not have
+> strong factual knowledge and is best used as a base for fine-tuning on a
+> downstream task.
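Since the card sets `pipeline_tag: fill-mask`, the snippet above can also be run through the pipeline API; a minimal sketch, using the same placeholder `USERNAME/REPO` id as the card:

```python
from transformers import pipeline

# trust_remote_code is needed here for the same reason as in the card.
fill = pipeline("fill-mask", model="USERNAME/REPO", trust_remote_code=True)
for cand in fill(f"the capital of france is {fill.tokenizer.mask_token}."):
    print(f"{cand['score']:.4f} {cand['token_str']!r}")
```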
modeling_mybert.py
CHANGED
@@ -211,10 +211,6 @@ class MyBertModel(MyBertPreTrainedModel):
         self.embeddings = MyBertEmbeddings(config)
         self.encoder = MyBertEncoder(config)
 
-        head_dim = config.hidden_size // config.num_attention_heads
-        cos, sin = _build_rope_cache(head_dim, config.max_position_embeddings, config.rope_theta)
-        self.register_buffer("rope_cos", cos, persistent=True)
-        self.register_buffer("rope_sin", sin, persistent=True)
 
         self.post_init()
 
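The removed lines baked the RoPE tables into every checkpoint as persistent buffers, which bloats the state dict and ties it to `max_position_embeddings`; the tables are deterministic, so they can be rebuilt from the config instead. For reference, a minimal sketch of a standard `_build_rope_cache` (the signature matches the removed call; the body is an assumption, not necessarily the repo's implementation):

```python
import torch

def _build_rope_cache(head_dim: int, max_positions: int, theta: float):
    # One rotation frequency per pair of channels, as in standard RoPE.
    inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
    # Angle for every (position, frequency) pair: (max_positions, head_dim // 2).
    angles = torch.outer(torch.arange(max_positions).float(), inv_freq)
    return angles.cos(), angles.sin()
```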
tokenizer.json
CHANGED
@@ -953,77 +953,28 @@
   "post_processor": {
     "type": "TemplateProcessing",
     "single": [
-      {
-        "SpecialToken": {
-          "id": "[CLS]",
-          "type_id": 0
-        }
-      },
       {
         "Sequence": {
           "id": "A",
           "type_id": 0
         }
-      },
-      {
-        "SpecialToken": {
-          "id": "[SEP]",
-          "type_id": 0
-        }
       }
     ],
     "pair": [
-      {
-        "SpecialToken": {
-          "id": "[CLS]",
-          "type_id": 0
-        }
-      },
       {
         "Sequence": {
           "id": "A",
           "type_id": 0
         }
       },
-      {
-        "SpecialToken": {
-          "id": "[SEP]",
-          "type_id": 0
-        }
-      },
       {
         "Sequence": {
           "id": "B",
           "type_id": 1
         }
-      },
-      {
-        "SpecialToken": {
-          "id": "[SEP]",
-          "type_id": 1
-        }
       }
     ],
-    "special_tokens": {
-      "[CLS]": {
-        "id": "[CLS]",
-        "ids": [
-          101
-        ],
-        "tokens": [
-          "[CLS]"
-        ]
-      },
-      "[SEP]": {
-        "id": "[SEP]",
-        "ids": [
-          102
-        ],
-        "tokens": [
-          "[SEP]"
-        ]
-      }
-    }
+    "special_tokens": {}
   },
   "decoder": {
     "type": "WordPiece",
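With the `SpecialToken` entries and their id mappings (101 for `[CLS]`, 102 for `[SEP]`) removed, the post-processor now emits the raw sequence only, which presumably matches the packed-MLM training setup described in the README. A quick sanity check, sketched with the same placeholder repo id and assuming `[CLS]`/`[SEP]` are still defined in the tokenizer's vocab:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("USERNAME/REPO")
ids = tok("hello world").input_ids
# The old template wrapped every encoding as [CLS] ... [SEP];
# the fixed one should add no special tokens at all.
assert tok.cls_token_id not in ids
assert tok.sep_token_id not in ids
print(ids)
```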