# Origin: Hugging Face Space yhavinga/dutch-tokenizer-arena
# (Space status: Running; file size: 2,964 bytes; revision 751936e).

"""

Tokenizer type: HFTokenizer


## Run



## Sources

- https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
- https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/tokenizer

"""

import json
import ftfy
from gpt_nexo_20b.tokenizer import build_tokenizer


class Encoder(object):
    """Thin wrapper that turns raw text into tokenizer output.

    The tokenizer is stored on the class itself (``Encoder.tokenizer``), not on
    the instance, so ``initializer()`` has to run once before ``encode()``.
    """

    def __init__(self, args):
        """Keep the configuration object for later use.

        Args:
            args: Configuration object. ``encode`` reads ``args.ftfy`` and
                ``initializer`` passes the whole object to ``build_tokenizer``.
        """
        self.args = args

    def initializer(self):
        """Build the tokenizer from ``self.args`` and store it on the class."""
        # Use Encoder class as a container for global data
        Encoder.tokenizer = build_tokenizer(self.args)

    def encode(self, text):
        """Tokenize ``text`` with the class-level tokenizer.

        Args:
            text: Input string. It is passed through ``ftfy.fix_text`` first
                when ``self.args.ftfy`` is truthy.

        Returns:
            Whatever ``Encoder.tokenizer.tokenize(text)`` returns.

        Raises:
            AttributeError: If no tokenizer has been set on the class yet
                (``initializer()`` was never called).
        """
        if self.args.ftfy:
            text = ftfy.fix_text(text)
        return Encoder.tokenizer.tokenize(text)


class HFConfig:
    """
    Argument container for building the 20B tokenizer as an ``HFTokenizer``.

    Open question left by the original author: what is ``jsonl_keys`` for?

    Matching config file: https://github.com/EleutherAI/gpt-neox/blob/main/configs/20B.yml
    "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
    "tokenizer_type": "HFTokenizer",
    """

    def __init__(self):
        # All values are fixed; instances of this class are handed to
        # build_tokenizer() and Encoder in place of parsed CLI arguments.
        vars(self).update(
            append_eod=True,
            ftfy=False,
            keep_empty=False,
            log_interval=100,
            make_vocab_size_divisible_by=128,
            model_parallel_size=1,
            padded_vocab_size=50304,
            rank=0,
            tokenizer_type="HFTokenizer",
            vocab_file="20B_tokenizer.json",
        )


class GPTConfig:
    """
    Argument container modelled on the GPT-NeoX local setup.

    Matching config file: https://github.com/EleutherAI/gpt-neox/blob/main/configs/local_setup.yml
    "vocab-file": "data/gpt2-vocab.json",
    "merge-file": "data/gpt2-merges.txt",

    "tokenizer_type": Default = GPT2BPETokenizer  # default value

    NOTE(review): only ``input``, ``merge_file`` and ``workers`` are set here;
    no ``vocab_file`` attribute is defined even though the config lists one.
    """

    def __init__(self):
        defaults = (
            ("input", "./data/enwik8/enwik8.zip"),
            ("merge_file", "./data/gpt2-merges.txt"),
            ("workers", 1),
        )
        for attr_name, attr_value in defaults:
            setattr(self, attr_name, attr_value)

class BERTConfig:
    """Placeholder; this tokenizer type does not appear to be supported.

    "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
    "tokenizer_type": "HFTokenizer",
    """


def test():
    """Smoke-test the tokenizer on a mixed Chinese/English sample.

    Prints the vocabulary size, the tokenizer output for the sample text, and
    then each token next to the text it detokenizes to on its own.
    """
    config = HFConfig()
    enc = Encoder(config)

    # A separate tokenizer instance is built just to report the vocab size;
    # the one used for encoding is created by enc.initializer() below.
    tok = build_tokenizer(config)
    print(f"Vocab size: {tok.vocab_size}")

    enc.initializer()
    token_ids = enc.encode("中国\ngood job一个人去哪里")
    # Reference ids: 13609 -> 中, 23197 -> 国
    print(token_ids)
    for token_id in token_ids:
        print(token_id, Encoder.tokenizer.detokenize([token_id]))



def convert_vocab():
    """Re-serialize ``20B_tokenizer.json`` as readable, non-escaped JSON.

    Reads ``20B_tokenizer.json`` from the current working directory and writes
    the same data to ``20B_tokenizer.zh.json`` with ``ensure_ascii=False`` and
    two-space indentation, so non-ASCII tokens appear as literal characters
    instead of ``\\uXXXX`` escapes.

    Raises:
        FileNotFoundError: If ``20B_tokenizer.json`` does not exist.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    # Context managers guarantee both handles are closed (and the output is
    # flushed) even if parsing or serialization fails.
    with open("20B_tokenizer.json", "r", encoding="utf-8") as f_in:
        vocab = json.load(f_in)
    with open("20B_tokenizer.zh.json", "w", encoding="utf-8") as f_out:
        json.dump(vocab, f_out, ensure_ascii=False, indent=2)


def dump_vocab():
    """Write every entry of the tokenizer's ``vocab`` to ``20B.vocab.txt``.

    Prints the vocabulary size first, then writes one entry per line (UTF-8)
    in the order that iterating ``tokenizer.vocab`` yields them.
    """
    config = HFConfig()
    tok = build_tokenizer(config)
    print(f"Vocab size: {tok.vocab_size}")
    with open("20B.vocab.txt", "w", encoding="utf-8") as sink:
        sink.writelines(entry + "\n" for entry in tok.vocab)

"""
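Sample output of test(): each line is a token id followed by the text that
single id detokenizes to. Id 187 is the newline in the sample text, so its
text shows up as a line break. Ids 20833 and 105 apparently cover the
character "哪" between them; neither piece decodes on its own, so each one
prints as "�".

13609 中
23197 国
187 

12311 good
2628  job
27896 一个
13484 人
44781 去
20833 �
105 �
42013 里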
"""



if __name__ == "__main__":
    # Default action when run as a script: the tokenizer smoke test.
    test()
    # Alternative entry points; uncomment to run them as well:
    # convert_vocab()
    # dump_vocab()