kd13 committed
Commit 263d80b · verified · 1 parent: 68b16ff

fix tokenizer post-processor, clean rope buffer code

Files changed (3):
  1. README.md +30 -3
  2. modeling_mybert.py +0 -4
  3. tokenizer.json +1 -50
README.md CHANGED
@@ -1,3 +1,30 @@
- ---
- license: mit
- ---
+ ---
+ library_name: transformers
+ pipeline_tag: fill-mask
+ ---
+
+ # MyBERT (RoPE + Pre-LN, ~21M params)
+
+ Custom BERT-style encoder trained with MLM on packed BookCorpus.
+ Loading requires `trust_remote_code=True` because the model uses a custom RoPE architecture.
+
+ ```python
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+ import torch, torch.nn.functional as F
+
+ tok = AutoTokenizer.from_pretrained("USERNAME/REPO")
+ mdl = AutoModelForMaskedLM.from_pretrained("USERNAME/REPO", trust_remote_code=True).eval()
+
+ text = f"the capital of france is {tok.mask_token}."
+ enc = tok(text, return_tensors="pt")
+ with torch.no_grad():
+     logits = mdl(**enc).logits
+ mask_pos = (enc["input_ids"][0] == tok.mask_token_id).nonzero()[0, 0]
+ probs = F.softmax(logits[0, mask_pos], dim=-1)
+ for p, i in zip(*[t.tolist() for t in probs.topk(5)]):
+     print(f"{p:.4f} {tok.decode([i])!r}")
+ ```
+
+ > **Note:** This is a small model trained with limited compute. It does not have
+ > strong factual knowledge and is best used as a base for fine-tuning on a
+ > downstream task.
modeling_mybert.py CHANGED
@@ -211,10 +211,6 @@ class MyBertModel(MyBertPreTrainedModel):
          self.embeddings = MyBertEmbeddings(config)
          self.encoder = MyBertEncoder(config)

-         head_dim = config.hidden_size // config.num_attention_heads
-         cos, sin = _build_rope_cache(head_dim, config.max_position_embeddings, config.rope_theta)
-         self.register_buffer("rope_cos", cos, persistent=True)
-         self.register_buffer("rope_sin", sin, persistent=True)

          self.post_init()

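The removed lines built the rotary cos/sin tables once in `MyBertModel.__init__` and registered them as persistent buffers, which bakes tensors that are a pure function of the config into every checkpoint. For reference, here is a minimal sketch of what `_build_rope_cache` conventionally computes, matching the removed call site's signature (this is the standard RoPE formulation, not necessarily this repo's exact implementation):

```python
import torch

def _build_rope_cache(head_dim: int, max_positions: int, theta: float = 10000.0):
    # Inverse frequencies theta^(-2i/d) over the even feature indices.
    inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))
    # One angle per (position, frequency) pair: shape (max_positions, head_dim // 2).
    angles = torch.outer(torch.arange(max_positions, dtype=torch.float32), inv_freq)
    return angles.cos(), angles.sin()
```

Because the tables are cheap to rebuild, computing them where attention needs them (or registering them with `persistent=False`) keeps checkpoints smaller and avoids stale buffers if `max_position_embeddings` changes.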
tokenizer.json CHANGED
@@ -953,77 +953,28 @@
  "post_processor": {
    "type": "TemplateProcessing",
    "single": [
-     {
-       "SpecialToken": {
-         "id": "[CLS]",
-         "type_id": 0
-       }
-     },
      {
        "Sequence": {
          "id": "A",
          "type_id": 0
        }
-     },
-     {
-       "SpecialToken": {
-         "id": "[SEP]",
-         "type_id": 0
-       }
      }
    ],
    "pair": [
-     {
-       "SpecialToken": {
-         "id": "[CLS]",
-         "type_id": 0
-       }
-     },
      {
        "Sequence": {
          "id": "A",
          "type_id": 0
        }
      },
-     {
-       "SpecialToken": {
-         "id": "[SEP]",
-         "type_id": 0
-       }
-     },
      {
        "Sequence": {
          "id": "B",
          "type_id": 1
        }
-     },
-     {
-       "SpecialToken": {
-         "id": "[SEP]",
-         "type_id": 1
-       }
      }
    ],
-   "special_tokens": {
-     "[CLS]": {
-       "id": "[CLS]",
-       "ids": [
-         101
-       ],
-       "tokens": [
-         "[CLS]"
-       ]
-     },
-     "[SEP]": {
-       "id": "[SEP]",
-       "ids": [
-         102
-       ],
-       "tokens": [
-         "[SEP]"
-       ]
-     }
-   }
+   "special_tokens": {}
  },
  "decoder": {
    "type": "WordPiece",