"""Build a "mock" copy of 20B_tokenizer_chinese.json in which every regular
vocab token is replaced by the string form of its own id, then reload the
result to confirm it is still a valid tokenizers file."""
import json

from tokenizers import Tokenizer


def export_mock_tokenizer():
    input_path = "20B_tokenizer_chinese.json"

    with open(input_path, "r", encoding="utf-8") as f_in:
        tokenizer = json.load(f_in)

    vocab = tokenizer["model"]["vocab"]
    # Added tokens keep their original surface forms; collect their ids once
    # (a set makes the membership test below O(1)).
    added_token_ids = {token["id"] for token in tokenizer["added_tokens"]}

    # Snapshot the items so the dict can be mutated while iterating.
    # Each regular entry such as {"hello": 123} becomes {"123": 123}
    # (illustrative values).
    for token, token_id in list(vocab.items()):
        if token_id not in added_token_ids:
            # Pop before inserting: if token already equals str(token_id),
            # the reverse order would delete the entry outright.
            vocab.pop(token)
            vocab[str(token_id)] = token_id

    out_path = input_path.replace(".json", ".mock.json")
    with open(out_path, "w", encoding="utf-8") as f_out:
        json.dump(tokenizer, f_out, ensure_ascii=False, indent=2)


def mock2():
    # Unused stub.
    pass


def load_mock_tokenizer():
    tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
    # Smoke test: the mock file should still load, with its full vocab intact.
    print(tokenizer.get_vocab_size())
    return tokenizer

export_mock_tokenizer()
load_mock_tokenizer()
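

# Hypothetical helper (illustrative sketch; the name check_mock_vocab and the
# default path are assumptions): verify that every non-added entry in the
# exported mock vocab is keyed by the string form of its own id.
def check_mock_vocab(path="20B_tokenizer_chinese.mock.json"):
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    added_ids = {t["id"] for t in data["added_tokens"]}
    for token, token_id in data["model"]["vocab"].items():
        if token_id not in added_ids:
            assert token == str(token_id), (token, token_id)


check_mock_vocab()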