def atomwise_tokenizer(smi, exclusive_tokens=None):
    """
    Tokenize a SMILES molecule at the atom level and return tokens with their token IDs.

    - 'Br' and 'Cl' are two-character tokens.
    - Bracketed symbols (e.g., '[C@@H]', '[nH]') are treated as single tokens.
    - If `exclusive_tokens` is provided, bracketed symbols not in `exclusive_tokens`
      are replaced by '[UNK]'.

    Parameters:
    - smi (str): SMILES string to tokenize.
    - exclusive_tokens (list of str, optional): Bracketed symbols to keep as-is.

    Returns:
    - tuple: (tokens, token_ids), where tokens is a list of atom-level tokens and
      token_ids is a list of integer IDs assigned in order of first appearance.
    """
    import re

    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    regex = re.compile(pattern)
    tokens = regex.findall(smi)

    # Replace bracketed tokens that are not in `exclusive_tokens` with '[UNK]'.
    if exclusive_tokens:
        tokens = [
            tok if tok in exclusive_tokens or not tok.startswith('[') else '[UNK]'
            for tok in tokens
        ]

    # Assign token IDs in order of first appearance of each unique token.
    token_to_id = {}
    token_ids = []
    for token in tokens:
        if token not in token_to_id:
            token_to_id[token] = len(token_to_id)
        token_ids.append(token_to_id[token])

    return tokens, token_ids
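

# Example usage (illustrative sketch only; the SMILES strings below are sample
# inputs chosen for demonstration and are not part of the original source).
if __name__ == "__main__":
    # Acetaminophen: aromatic ring atoms, branches, and ring-closure digits
    # each become single-character tokens.
    tokens, token_ids = atomwise_tokenizer("CC(=O)Nc1ccc(O)cc1")
    print(tokens)     # ['C', 'C', '(', '=', 'O', ')', 'N', 'c', '1', 'c', 'c', 'c', '(', 'O', ')', 'c', 'c', '1']
    print(token_ids)  # [0, 0, 1, 2, 3, 4, 5, 6, 7, 6, 6, 6, 1, 3, 4, 6, 6, 7]

    # With `exclusive_tokens`, bracketed symbols outside the list map to '[UNK]'.
    tokens, _ = atomwise_tokenizer("C[C@@H](N)C(=O)[O-]", exclusive_tokens=['[C@@H]'])
    print(tokens)     # ['C', '[C@@H]', '(', 'N', ')', 'C', '(', '=', 'O', ')', '[UNK]']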