File size: 1,528 Bytes
751936e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
"""
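What does "merge" do here?

For each single-character entry of the OOV list that the base tokenizer
encodes as exactly two tokens, this script adds the concatenation of those two
tokens to the vocabulary and appends the matching rule to the merges list.

## Result

4357 tokens merged in total.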
"""

import json
from tokenizers import Tokenizer


# Out-of-vocabulary candidates: the first tab-separated column of every line.
# A context manager is used so the file handle is closed deterministically.
with open("../gpt_neox_chinese_v1/oov.txt", "r", encoding="utf-8") as oov_file:
    oov_tokens = [line.strip().split("\t")[0] for line in oov_file]


def load_base_tokenizer():
    """Load the base tokenizer file as raw JSON and as a ``Tokenizer``.

    Returns:
        A ``(data, tokenizer)`` tuple. ``data`` is the parsed JSON content of
        the tokenizer file; ``tokenizer`` is built from that same file with
        ``Tokenizer.from_file``.
    """
    old_vocab_path = "../gpt_neox_chinese_v1/20B_tokenizer_chinese.json"
    # Context manager so the handle is closed as soon as parsing finishes.
    with open(old_vocab_path, "r", encoding="utf-8") as vocab_file:
        data = json.load(vocab_file)
    tokenizer = Tokenizer.from_file(old_vocab_path)
    # Previously only the label was printed; report the actual size as well.
    print(
        "vocab_size with added_tokens:",
        tokenizer.get_vocab_size(with_added_tokens=True),
    )
    return data, tokenizer

# Raw JSON of the tokenizer file plus the tokenizer object built from it.
data, base_tokenizer = load_base_tokenizer()
model_section = data["model"]
# Both objects are references into ``data``; mutating them mutates ``data``.
vocab, merges = model_section["vocab"], model_section["merges"]
# Size of the vocabulary counted together with the added tokens.
vocab_size = base_tokenizer.get_vocab_size(with_added_tokens=True)


"""
Approach 1: the existing added_tokens keep their ids unchanged. Approach 2: the ids of the existing added_tokens are shifted.
Approach 1 is used below.
"""
# Characters for which a new merged token has already been registered.
new_added_tokens = set()
for candidate in oov_tokens:
    # Multi-character entries are skipped.
    if len(candidate) > 1:
        continue
    # Skip characters that were already merged earlier in this run.
    if candidate in new_added_tokens:
        continue

    piece_ids = base_tokenizer.encode(candidate).ids
    # Only characters split into exactly two tokens are merged.
    # TODO: characters split into 3 tokens are not handled yet.
    if len(piece_ids) != 2:
        continue

    pieces = [base_tokenizer.id_to_token(piece_id) for piece_id in piece_ids]
    print("merging", candidate, json.dumps(pieces))
    # The merged token takes the next free id; the merge rule is the two
    # pieces separated by a single space.
    vocab["".join(pieces)] = vocab_size
    vocab_size += 1
    merges.append(" ".join(pieces))
    new_added_tokens.add(candidate)


# Report how many characters received a new merged token.
print("共merge %d 个 token" % (len(new_added_tokens)))

# Write the updated tokenizer definition. The context manager guarantees the
# file is flushed and closed, so the JSON on disk is complete.
with open("20B_tokenizer_chinese.v2.json", "w", encoding="utf-8") as f_out:
    json.dump(data, f_out, indent=2)