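"""Train a byte-level BPE tokenizer (RoBERTa-style vocabulary and special
tokens) on a Hugging Face dataset previously saved with save_to_disk."""
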
import argparse
import os
import sys

from datasets import load_from_disk
from tokenizers import ByteLevelBPETokenizer


def parse_arguments(parser):
    parser.add_argument(
        "--dataset-dir",
        required=True,
        type=str,
        help="Path to the dataset directory saved with datasets save_to_disk",
    )
    parser.add_argument(
        "--out",
        default="./",
        type=str,
        help="Path to the output directory where the tokenizer files will be saved",
    )
    parser.add_argument(
        "--name",
        default="bpe-bytelevel",
        type=str,
        help="Base name of the output vocab files",
    )
    return parser.parse_args()


def main(args):
    if not os.path.isdir(args.dataset_dir):
        raise FileNotFoundError(f"--dataset-dir does not exist: {args.dataset_dir}")

    os.makedirs(args.out, exist_ok=True)

    print(f"LOADING {args.dataset_dir}...")
    raw_dataset = load_from_disk(args.dataset_dir)
    print(raw_dataset)
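
    # NOTE: the code below assumes load_from_disk returned a single Dataset
    # exposing a 'text' column; if the directory holds a DatasetDict, select
    # a split first (e.g. raw_dataset = raw_dataset["train"]).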

    def batch_iterator(batch_size=10000):
        # Stream the 'text' column in slices so the whole corpus never has to
        # be held by the trainer at once.
        for i in range(0, len(raw_dataset), batch_size):
            yield raw_dataset[i : i + batch_size]["text"]

    tokenizer = ByteLevelBPETokenizer(lowercase=False)

    # Truncation applies at encoding time; training still sees the full text.
    tokenizer.enable_truncation(max_length=512)
    # RoBERTa-style vocabulary size and special-token set.
    tokenizer.train_from_iterator(
        batch_iterator(),
        vocab_size=50265,
        min_frequency=2,
        special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>",
        ],
    )

    print(f"SAVING TOKENIZER CONFIG INTO {args.out}...")
    # tokenizer.json is the full serialized tokenizer; save_model writes the
    # raw vocab.json / merges.txt pair under the given name prefix.
    tokenizer.save(os.path.join(args.out, "tokenizer.json"))
    tokenizer.save_model(args.out, args.name)

    print("SO FAR SO GOOD...")


if __name__ == "__main__":
    args = parse_arguments(argparse.ArgumentParser())
    print(args)

    try:
        main(args)
    except Exception as excp:
        # Report the failure and exit non-zero instead of silently succeeding.
        print(excp, file=sys.stderr)
        sys.exit(1)
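
# Example invocation (script name and paths are illustrative):
#   python train_tokenizer.py --dataset-dir ./my_dataset --out ./tokenizer --name bpe-bytelevel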