SmilesPeTokenizer / SmilesPeTokenizer.py
saicharan2804
Added token IDs
f23bcf0
raw
history blame
464 Bytes
from functools import lru_cache

from tokenizers import Tokenizer
@lru_cache(maxsize=1)
def _load_tokenizer():
    """Load the pretrained BPE tokenizer once and cache it.

    The original implementation re-read and re-parsed
    ``chembl_bpe_tokenizer.json`` on every call to ``bpe_tokenizer``;
    caching makes repeated tokenization calls cheap.
    """
    return Tokenizer.from_file("chembl_bpe_tokenizer.json")


def bpe_tokenizer(smiles_string):
    """Tokenize a SMILES string with the ChEMBL BPE tokenizer.

    Args:
        smiles_string: A SMILES molecule string to tokenize.

    Returns:
        A tuple ``(tokens_text, token_ids)`` where ``tokens_text`` is the
        list of token strings and ``token_ids`` is the parallel list of
        integer vocabulary IDs.
    """
    encoded_output = _load_tokenizer().encode(smiles_string)
    return encoded_output.tokens, encoded_output.ids