"""

tokenizer类型:HFTokenizer


## Run



## 来源

- https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
- https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/tokenizer

"""

import json
import ftfy
from gpt_nexo_20b.tokenizer import build_tokenizer


class Encoder(object):
    def __init__(self, args):
        self.args = args

    def initializer(self):
        # Use Encoder class as a container for global data
        Encoder.tokenizer = build_tokenizer(self.args)

    def encode(self, text):
        # Optionally normalize the text with ftfy before tokenizing.
        if self.args.ftfy:
            text = ftfy.fix_text(text)
        # Unlike the upstream preprocess_data.py, this returns the raw token ids
        # and ignores args.append_eod / args.jsonl_keys.
        text_ids = Encoder.tokenizer.tokenize(text)
        return text_ids
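

# For context: in the upstream tools/preprocess_data.py, Encoder is used as a
# multiprocessing worker -- initializer() runs once per worker process to build the
# tokenizer, and encode() is mapped over the input documents. The sketch below is
# illustrative only: the function name, the "input.jsonl" path, and the assumption
# that each JSON line carries a "text" key (what jsonl_keys selects upstream) are
# ours, not part of this repo.
def encode_jsonl_sketch(args, path="input.jsonl", workers=1):
    import multiprocessing
    encoder = Encoder(args)
    with open(path, "r", encoding="utf-8") as fin:
        with multiprocessing.Pool(workers, initializer=encoder.initializer) as pool:
            texts = (json.loads(line)["text"] for line in fin)
            for token_ids in pool.imap(encoder.encode, texts, 25):
                yield token_ids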


class HFConfig:
    """
    jsonl_keys 是干嘛的?

    对应的配置文件:https://github.com/EleutherAI/gpt-neox/blob/main/configs/20B.yml
    "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
    "tokenizer_type": "HFTokenizer",
    """
    def __init__(self):
        self.append_eod = True
        self.ftfy = False
        self.keep_empty = False
        self.log_interval = 100
        self.make_vocab_size_divisible_by = 128
        self.model_parallel_size = 1
        self.padded_vocab_size = 50304
        self.rank = 0
        self.tokenizer_type = 'HFTokenizer'
        self.vocab_file = '20B_tokenizer.json'
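

# padded_vocab_size above comes from GPT-NeoX padding the raw vocab (50,277 entries
# for the 20B tokenizer) up to the next multiple of
# make_vocab_size_divisible_by * model_parallel_size. A minimal sketch of that
# arithmetic (the helper name is ours, not gpt-neox's):
def padded_vocab_size_sketch(vocab_size=50277, divisible_by=128, model_parallel_size=1):
    multiple = divisible_by * model_parallel_size
    return ((vocab_size + multiple - 1) // multiple) * multiple  # 50277 -> 50304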


class GPTConfig:
    """
    Corresponding config file: https://github.com/EleutherAI/gpt-neox/blob/main/configs/local_setup.yml
    "vocab-file": "data/gpt2-vocab.json",
    "merge-file": "data/gpt2-merges.txt",

    "tokenizer_type": Default = GPT2BPETokenizer  # default value
    """
    def __init__(self):
        self.input = './data/enwik8/enwik8.zip'
        # The fields below (beyond input/merge_file/workers) are assumed: they mirror
        # HFConfig so that build_tokenizer can consume this config too; the vocab and
        # merge paths come from the docstring above.
        self.vocab_file = './data/gpt2-vocab.json'
        self.merge_file = './data/gpt2-merges.txt'
        self.tokenizer_type = 'GPT2BPETokenizer'
        self.rank = 0
        self.make_vocab_size_divisible_by = 128
        self.model_parallel_size = 1
        self.workers = 1
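

# A usage sketch (not exercised in this repo): with the fields above, GPTConfig
# should be consumable by build_tokenizer the same way HFConfig is in test(),
# provided data/gpt2-vocab.json and data/gpt2-merges.txt exist locally. The
# function name is ours.
def test_gpt2_bpe_sketch():
    args = GPTConfig()
    tokenizer = build_tokenizer(args)
    print(f"GPT2 BPE vocab size: {tokenizer.vocab_size}")
    print(tokenizer.tokenize("中国\ngood job一个人去哪里"))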

class BERTConfig:
    """ 好像不支持
    "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
    "tokenizer_type": "HFTokenizer",
    """
    pass


def test():
    args = HFConfig()
    encoder = Encoder(args)
    tokenizer = build_tokenizer(args)
    print(f"Vocab size: {tokenizer.vocab_size}")
    # Sets Encoder.tokenizer (build_tokenizer ends up being called a second time here).
    encoder.initializer()

    tokens = encoder.encode("中国\ngood job一个人去哪里")
    # 13609 中
    # 23197 国
    print(tokens)
    # Detokenize id by id; pieces that are only part of a UTF-8 byte sequence print as "�".
    for token in tokens:
        print(token, Encoder.tokenizer.detokenize([token]))



def convert_vocab():
    # Re-dump the tokenizer JSON with ensure_ascii=False so that non-ASCII tokens
    # (e.g. Chinese) are human-readable in the output file.
    with open("20B_tokenizer.json", "r", encoding="utf-8") as f_in:
        vocab = json.load(f_in)
    with open("20B_tokenizer.zh.json", "w", encoding="utf-8") as f_out:
        json.dump(vocab, f_out, ensure_ascii=False, indent=2)


def dump_vocab():
    args = HFConfig()
    tokenizer = build_tokenizer(args)
    print(f"Vocab size: {tokenizer.vocab_size}")
    with open("20B.vocab.txt", "w", encoding="utf-8") as f_out:
        for token in tokenizer.vocab:
            f_out.write(token + "\n")
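

# tokenizer.vocab for the HFTokenizer is a token -> id mapping, so the loop above
# writes tokens in dict order rather than id order. A variant sketch (the output
# file name is ours) that writes "id<TAB>token" sorted by id:
def dump_vocab_by_id_sketch():
    args = HFConfig()
    tokenizer = build_tokenizer(args)
    with open("20B.vocab.by_id.txt", "w", encoding="utf-8") as f_out:
        for token, token_id in sorted(tokenizer.vocab.items(), key=lambda kv: kv[1]):
            f_out.write(f"{token_id}\t{token}\n")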

"""
13609 中
23197 国
187 

12311 good
2628  job
27896 一个
13484 人
44781 去
20833 �
105 �
42013 里
"""



if __name__ == "__main__":
    test()
    # convert_vocab()
    # dump_vocab()