trained tokenizer with 50265 vocab size
904d64e
from tokenizers import ByteLevelBPETokenizer
from datasets import load_from_disk
import os
import argparse


def parse_arguments(parser):
    parser.add_argument(
        "--dataset-dir",
        required=True,
        help="Path to the dataset folder",
    )
    parser.add_argument(
        "--out",
        default="./",
        type=str,
        help="Path to the output directory where the files will be saved",
    )
    parser.add_argument(
        "--name",
        default="bpe-bytelevel",
        type=str,
        help="The name of the output vocab files",
    )
    return parser.parse_args()


def main(args):
    if not os.path.isdir(args.dataset_dir):
        raise FileNotFoundError('--dataset-dir does not exist')
    os.makedirs(args.out, exist_ok=True)

    print('LOADING %s...' % args.dataset_dir)
    raw_dataset = load_from_disk(args.dataset_dir)
    print(raw_dataset)

    # Stream the corpus in batches so the whole dataset never has to be
    # materialized in memory at once.
    def batch_iterator(batch_size=10000):
        for i in range(0, len(raw_dataset), batch_size):
            yield raw_dataset[i : i + batch_size]['text']

    tokenizer = ByteLevelBPETokenizer(
        lowercase=False,
    )
    tokenizer.enable_truncation(max_length=512)

    # Train a byte-level BPE vocabulary with RoBERTa-style special tokens.
    tokenizer.train_from_iterator(
        batch_iterator(),
        vocab_size=50265,
        min_frequency=2,
        special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>",
        ],
    )

    print("SAVING TOKENIZER CONFIG INTO %s..." % args.out)
    # tokenizer.json holds the full serialized tokenizer; save_model writes
    # the raw {name}-vocab.json and {name}-merges.txt files.
    tokenizer.save(os.path.join(args.out, "tokenizer.json"))
    tokenizer.save_model(args.out, args.name)
    print("SO FAR SO GOOD...")


if __name__ == '__main__':
    args = parse_arguments(argparse.ArgumentParser())
    print(args)
    try:
        main(args)
    except Exception as excp:
        print(excp)
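As a quick sanity check, the saved tokenizer.json can be loaded back with the tokenizers library and used to encode a sample. The script filename and paths in the sketch below are illustrative assumptions, not part of this commit:

# Minimal sketch: load the trained tokenizer and inspect its output.
# Hypothetical invocation of the script above (filename assumed):
#   python train_tokenizer.py --dataset-dir ./my_dataset --out ./out --name bpe-bytelevel
from tokenizers import Tokenizer

tok = Tokenizer.from_file("./out/tokenizer.json")  # file written by tokenizer.save()
enc = tok.encode("A quick sanity check.")
print(enc.tokens)  # byte-level BPE pieces
print(enc.ids)     # ids within the 50265-entry vocab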