Spaces:
Runtime error
Runtime error
import codecs | |
from SmilesPE.tokenizer import * | |
def load_vocabulary_to_dict(vocabulary_path):
    """Map each vocabulary token to the index of the line it appears on.

    The file is read as UTF-8. Only the first whitespace-separated field of
    each line is used as the token; any further fields on the line are
    ignored.

    Args:
        vocabulary_path: Path to the vocabulary text file.

    Returns:
        A dict mapping token -> zero-based line index. Blank lines are
        skipped but still counted, so the IDs of the remaining tokens do not
        shift. If a token starts several lines, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    vocab_dict = {}
    with codecs.open(vocabulary_path, 'r', 'utf-8') as file:
        for index, line in enumerate(file):
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line (e.g. a trailing empty line):
                # taking fields[0] here would raise IndexError.
                continue
            vocab_dict[fields[0]] = index
    return vocab_dict
def smilespe_tokenizer(smiles_string, vocab_dict,
                       spe_vocab_path='chembl_smiles_tokenizer30000.txt'):
    """Tokenize a SMILES string with SmilesPE and look up the token IDs.

    Args:
        smiles_string: The SMILES string to tokenize.
        vocab_dict: Mapping of token -> ID, e.g. as produced by
            ``load_vocabulary_to_dict``.
        spe_vocab_path: Path to the SPE vocabulary file handed to
            ``SPE_Tokenizer``. Defaults to the path that was previously
            hard-coded, so existing two-argument calls behave as before.

    Returns:
        A ``(tokenized, token_ids)`` tuple. ``tokenized`` is returned exactly
        as ``SPE_Tokenizer.tokenize`` produced it. ``token_ids`` holds the ID
        of every token found in ``vocab_dict``; tokens missing from the
        vocabulary are silently dropped, so the two may differ in length.

    Raises:
        OSError: If ``spe_vocab_path`` cannot be opened.
    """
    # Close the vocabulary file as soon as the tokenizer has been built
    # instead of leaking one open handle per call.
    # NOTE(review): this relies on SPE_Tokenizer reading the whole file in its
    # constructor -- confirm against the installed SmilesPE version.
    with codecs.open(spe_vocab_path, 'r', 'utf-8') as spe_vob:
        spe = SPE_Tokenizer(spe_vob)

    tokenized = spe.tokenize(smiles_string)

    # SmilesPE's tokenize() yields one whitespace-separated string; iterating
    # it directly would walk single characters rather than tokens. Split it
    # first, but still accept an already-split sequence.
    if isinstance(tokenized, str):
        tokens = tokenized.split()
    else:
        tokens = list(tokenized)

    token_ids = [vocab_dict[token] for token in tokens if token in vocab_dict]
    return tokenized, token_ids
# Example usage (commented out):
# Load the vocabulary into a dictionary
# vocab_path = 'chembl_smiles_tokenizer30000.txt'
# vocab_dict = load_vocabulary_to_dict(vocab_path)
# Tokenize a sample SMILES string
# smiles_string = 'Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1'
# tokens, token_ids = smilespe_tokenizer(smiles_string, vocab_dict)
# print("Tokens:", tokens)
# print("Token IDs:", token_ids)