"""Train a YouTokenToMe BPE model on random text and tokenize a sample.

The script writes a synthetic training corpus to ``train_data_path``,
trains a BPE model saved at ``model_path``, loads that model, and prints
one randomly generated test string encoded as ids and as subwords.
"""

import random

import youtokentome as yttm

# NOTE: the model is loaded only after training, further down. An earlier
# load of "e.model" was removed: this script never creates that file, and
# the resulting object was overwritten before it was ever used.

train_data_path = "train_data.txt"
model_path = "example.model"

# Generate a random training file:
# 10000 lines with 100 characters in each line.
n_lines = 10000
n_characters = 100
with open(train_data_path, "w") as fout:
    for _ in range(n_lines):
        line = "".join(random.choice("abcd ") for _ in range(n_characters))
        print(line, file=fout)

# Generate the random text to tokenize. Its alphabet includes "e", which
# never appears in the training data written above.
test_text = "".join(random.choice("abcde ") for _ in range(100))

# Train the model; the result is saved to ``model_path``.
yttm.BPE.train(data=train_data_path, vocab_size=5000, model=model_path)

# Load the model that was just trained.
bpe = yttm.BPE(model=model_path)

# Two types of tokenization: ids and subword strings.
print(bpe.encode([test_text], output_type=yttm.OutputType.ID))
print(bpe.encode([test_text], output_type=yttm.OutputType.SUBWORD))