fix tokenizer post-processor, clean rope buffer code
- README.md +30 -3
- modeling_mybert.py +0 -4
- tokenizer.json +1 -50
README.md
CHANGED
@@ -1,3 +1,30 @@
----
-
-
+---
+library_name: transformers
+pipeline_tag: fill-mask
+---
+
+# MyBERT (RoPE + Pre-LN, ~21M params)
+
+Custom BERT-style encoder trained with MLM on packed BookCorpus.
+`trust_remote_code=True` is required because the model uses custom modeling code (RoPE + Pre-LN).
+
+```python
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+import torch, torch.nn.functional as F
+
+tok = AutoTokenizer.from_pretrained("USERNAME/REPO")
+mdl = AutoModelForMaskedLM.from_pretrained("USERNAME/REPO", trust_remote_code=True).eval()
+
+text = f"the capital of france is {tok.mask_token}."
+enc = tok(text, return_tensors="pt")
+with torch.no_grad():
+    logits = mdl(**enc).logits
+mask_pos = (enc["input_ids"][0] == tok.mask_token_id).nonzero()[0, 0]
+probs = F.softmax(logits[0, mask_pos], dim=-1)
+for p, i in zip(*[t.tolist() for t in probs.topk(5)]):
+    print(f"{p:.4f} {tok.decode([i])!r}")
+```
+
+> **Note:** This is a small model trained with limited compute. It does not have
+> strong factual knowledge and is best used as a base for fine-tuning on a
+> downstream task.
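Since the card sets `pipeline_tag: fill-mask`, the snippet above can also be run through the pipeline API; a minimal sketch, using the same placeholder `USERNAME/REPO` id as the card:

```python
from transformers import pipeline

# trust_remote_code is needed here for the same reason as in the card.
fill = pipeline("fill-mask", model="USERNAME/REPO", trust_remote_code=True)
for cand in fill(f"the capital of france is {fill.tokenizer.mask_token}."):
    print(f"{cand['score']:.4f} {cand['token_str']!r}")
```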
modeling_mybert.py
CHANGED
@@ -211,10 +211,6 @@ class MyBertModel(MyBertPreTrainedModel):
         self.embeddings = MyBertEmbeddings(config)
         self.encoder = MyBertEncoder(config)
 
-        head_dim = config.hidden_size // config.num_attention_heads
-        cos, sin = _build_rope_cache(head_dim, config.max_position_embeddings, config.rope_theta)
-        self.register_buffer("rope_cos", cos, persistent=True)
-        self.register_buffer("rope_sin", sin, persistent=True)
 
         self.post_init()
 
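The removed lines baked the RoPE tables into every checkpoint as persistent buffers, which bloats the state dict and ties it to `max_position_embeddings`; the tables are deterministic, so they can be rebuilt from the config instead. For reference, a minimal sketch of a standard `_build_rope_cache` (the signature matches the removed call; the body is an assumption, not necessarily the repo's implementation):

```python
import torch

def _build_rope_cache(head_dim: int, max_positions: int, theta: float):
    # One rotation frequency per pair of channels, as in standard RoPE.
    inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
    # Angle for every (position, frequency) pair: (max_positions, head_dim // 2).
    angles = torch.outer(torch.arange(max_positions).float(), inv_freq)
    return angles.cos(), angles.sin()
```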
tokenizer.json
CHANGED
@@ -953,77 +953,28 @@
   "post_processor": {
     "type": "TemplateProcessing",
     "single": [
-      {
-        "SpecialToken": {
-          "id": "[CLS]",
-          "type_id": 0
-        }
-      },
       {
         "Sequence": {
           "id": "A",
           "type_id": 0
         }
-      },
-      {
-        "SpecialToken": {
-          "id": "[SEP]",
-          "type_id": 0
-        }
       }
     ],
     "pair": [
-      {
-        "SpecialToken": {
-          "id": "[CLS]",
-          "type_id": 0
-        }
-      },
       {
         "Sequence": {
           "id": "A",
           "type_id": 0
         }
       },
-      {
-        "SpecialToken": {
-          "id": "[SEP]",
-          "type_id": 0
-        }
-      },
       {
         "Sequence": {
           "id": "B",
           "type_id": 1
         }
-      },
-      {
-        "SpecialToken": {
-          "id": "[SEP]",
-          "type_id": 1
-        }
       }
     ],
-    "special_tokens": {
-      "[CLS]": {
-        "id": "[CLS]",
-        "ids": [
-          101
-        ],
-        "tokens": [
-          "[CLS]"
-        ]
-      },
-      "[SEP]": {
-        "id": "[SEP]",
-        "ids": [
-          102
-        ],
-        "tokens": [
-          "[SEP]"
-        ]
-      }
-    }
+    "special_tokens": {}
   },
   "decoder": {
     "type": "WordPiece",
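With the `SpecialToken` entries and their id mappings (101 for `[CLS]`, 102 for `[SEP]`) removed, the post-processor now emits the raw sequence only, which presumably matches the packed-MLM training setup described in the README. A quick sanity check, sketched with the same placeholder repo id and assuming `[CLS]`/`[SEP]` are still defined in the tokenizer's vocab:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("USERNAME/REPO")
ids = tok("hello world").input_ids
# The old template wrapped every encoding as [CLS] ... [SEP];
# the fixed one should add no special tokens at all.
assert tok.cls_token_id not in ids
assert tok.sep_token_id not in ids
print(ids)
```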