import json

from tokenizers import Tokenizer


def export_mock_tokenizer():
    """Export a "mock" copy of the tokenizer in which each regular token's
    string key is replaced by the string form of its own id, so tokens
    display as their ids. Added (special) tokens are left untouched."""
    input_path = "20B_tokenizer_chinese.json"
    with open(input_path, "r", encoding="utf-8") as f_in:
        tokenizer = json.load(f_in)

    vocab = tokenizer["model"]["vocab"]
    added_token_ids = {token["id"] for token in tokenizer["added_tokens"]}

    # Rebuild the vocab in one pass instead of mutating it while iterating;
    # this also avoids clobbering a remapped entry when an id string such as
    # "100" collides with a numeric token that already exists in the vocab.
    new_vocab = {}
    for token, token_id in vocab.items():
        if token_id in added_token_ids:
            new_vocab[token] = token_id
        else:
            new_vocab[str(token_id)] = token_id
    tokenizer["model"]["vocab"] = new_vocab

    out_path = input_path.replace(".json", ".mock.json")
    with open(out_path, "w", encoding="utf-8") as f_out:
        f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))


def mock2():
    # Unused placeholder for a second mock variant.
    pass


def load_mock_tokenizer():
    """Sanity-check that the exported mock file still loads as a Tokenizer."""
    tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
    print(tokenizer.get_vocab_size())


if __name__ == "__main__":
    export_mock_tokenizer()
    load_mock_tokenizer()
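

# --- Illustrative sanity check (a minimal sketch, not part of the original
# script). After export_mock_tokenizer() has run, every non-added token in the
# mock file should map its id to the string form of that id. The sample ids
# below are arbitrary and assumed not to be added/special tokens; call this
# manually after the export, e.g. check_mock_remap().
def check_mock_remap(sample_ids=(100, 1000, 20000)):
    original = Tokenizer.from_file("20B_tokenizer_chinese.json")
    mock = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
    for token_id in sample_ids:
        # id_to_token returns the vocab string for an id (None if absent).
        print(token_id,
              repr(original.id_to_token(token_id)),
              repr(mock.id_to_token(token_id)))
        assert mock.id_to_token(token_id) == str(token_id)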