File size: 1,095 Bytes
44c70f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53

```python
import json
import os
from transformers.configuration_roberta import RobertaConfig
from transformers import RobertaForMaskedLM, TFRobertaForMaskedLM

# Output directory that receives the model files written below.
DIRNAME = "./dummy-unknown"

# Make sure the target directory exists before anything is saved into it;
# some transformers releases require the save path to already be a directory.
os.makedirs(DIRNAME, exist_ok=True)

# Deliberately tiny hyperparameters so the exported dummy model stays small.
# Keyword arguments are used instead of the former positional call
# `RobertaConfig(10, 20, 1, 1, 40)`: the positional form depends on the
# constructor's parameter order, which differs between transformers releases.
# The mapping below follows BertConfig's leading parameters.
# NOTE(review): vocab_size=10 is smaller than the 20-entry tokenizer vocab
# defined further down in this script -- confirm this mismatch is intended.
config = RobertaConfig(
    vocab_size=10,
    hidden_size=20,
    num_hidden_layers=1,
    num_attention_heads=1,
    intermediate_size=40,
)

# Build the PyTorch model straight from the config (no pretrained weights are
# loaded here) and write it to DIRNAME.
model = RobertaForMaskedLM(config)
model.save_pretrained(DIRNAME)

# Re-load the just-saved PyTorch checkpoint as a TensorFlow model
# (from_pt=True) and save it into the same directory.
tf_model = TFRobertaForMaskedLM.from_pretrained(DIRNAME, from_pt=True)
tf_model.save_pretrained(DIRNAME)

# Tokenizer: write a minimal vocab.json / merges.txt pair into DIRNAME.

# Ten single-character symbols first, then the multi-character entries.
# "\u0120" is presumably the tokenizer's leading-space marker -- confirm.
vocab = list("lowerstidn") + [
    "\u0120",
    "\u0120l",
    "\u0120n",
    "\u0120lo",
    "\u0120low",
    "er",
    "\u0120lowest",
    "\u0120newer",
    "\u0120wider",
    "<unk>",
]

# Map each token to its position in the list above.
vocab_tokens = {token: index for index, token in enumerate(vocab)}

# First entry is the version header line; the trailing empty string makes the
# newline-joined output end with a final "\n".
merges = [
    "#version: 0.2",
    "\u0120 l",
    "\u0120l o",
    "\u0120lo w",
    "e r",
    "",
]

vocab_file = os.path.join(DIRNAME, "vocab.json")
merges_file = os.path.join(DIRNAME, "merges.txt")

# The vocabulary is stored as a single JSON line.
with open(vocab_file, "w", encoding="utf-8") as fp:
    fp.write(f"{json.dumps(vocab_tokens)}\n")

# The merge rules are stored one per line.
with open(merges_file, "w", encoding="utf-8") as fp:
    merges_text = "\n".join(merges)
    fp.write(merges_text)
```