import argparse
import glob
import os

from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2TokenizerFast

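# Example invocation (the script name, paths, and values below are
# illustrative, not taken from the original):
#
#   python train_bpe_tokenizer.py \
#       --data-files "data/*.txt" \
#       --vocab-size 52000 \
#       --output-dir ./tokenizer \
#       --output-file-name my_corpus
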
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-files", type=str, required=True,
                        help="Glob pattern matching the training text files.")
    parser.add_argument("--vocab-size", type=int, required=True,
                        help="Target vocabulary size for the BPE model.")
    parser.add_argument("--output-dir", type=str, required=True,
                        help="Directory the trained tokenizer is written to.")
    parser.add_argument("--output-file-name", type=str, required=True,
                        help="Filename prefix for the saved vocab/merges files.")
    args = parser.parse_args()

    # Byte-level BPE tokenizer of the kind GPT-2 uses; add_prefix_space=True
    # makes a word at the start of a sequence tokenize the same way as the
    # same word mid-sentence.
    gpt2_tok = ByteLevelBPETokenizer(add_prefix_space=True)

    # Expand the glob and print a sample of at most 10 files as a sanity check.
    files = glob.glob(args.data_files)
    print(files[:10])

    # Train the byte-level BPE model on the matched files.
    gpt2_tok.train(
        files=files,
        vocab_size=args.vocab_size,
        show_progress=True,
        special_tokens=["<|endoftext|>", "<s>", "<pad>", "</s>"],
    )
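
    # The special tokens are added to the vocabulary first, so they take the
    # lowest ids; for example (illustrative):
    #   gpt2_tok.token_to_id("<|endoftext|>")  # -> 0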

    # Ensure the output directory exists (exist_ok replaces the explicit check).
    os.makedirs(args.output_dir, exist_ok=True)

    # tokenizer.json holds the full serialized tokenizer (model, pre-tokenizer,
    # special tokens); save_model writes the raw vocab.json / merges.txt pair
    # under the given filename prefix.
    gpt2_tok.save(os.path.join(args.output_dir, "tokenizer.json"), pretty=True)
    gpt2_tok.save_model(args.output_dir, args.output_file_name)
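
    # Optional sanity check, a minimal sketch: reload the serialized tokenizer
    # through transformers via GPT2TokenizerFast's tokenizer_file argument
    # (the sample sentence is illustrative only).
    reloaded = GPT2TokenizerFast(
        tokenizer_file=os.path.join(args.output_dir, "tokenizer.json")
    )
    print(reloaded.tokenize("Hello, world!"))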