import json

input_path = "20B_tokenizer_chinese.json"

# Load the tokenizer JSON (closing the file handle properly).
with open(input_path, "r", encoding="utf-8") as f_in:
    tokenizer = json.load(f_in)

vocab = tokenizer["model"]["vocab"]

# Rebuild the vocabulary so each token id maps from its own string form:
# the token text is discarded and replaced by str(id). A dict comprehension
# avoids mutating the dict while iterating, which could drop entries when a
# token string collides with another token's stringified id.
tokenizer["model"]["vocab"] = {str(v): v for v in vocab.values()}

# Write the mocked tokenizer next to the input with a .mock.json suffix.
out_path = input_path.replace(".json", ".mock.json")
with open(out_path, "w", encoding="utf-8") as f_out:
    json.dump(tokenizer, f_out, ensure_ascii=False, indent=2)
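
# Example of the rewrite (hypothetical entries, a minimal sketch of the
# transformation above):
#   before: {"hello": 17, "world": 42}
#   after:  {"17": 17, "42": 42}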