import collections
from importlib import resources
import os
import re
from typing import Optional, List

import numpy as np
from transformers import BertTokenizer

SMI_REGEX_PATTERN = r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""
# \[[^\]]+\]                                        # match anything inside square brackets
# |Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p                  # match elements
# |\(|\)                                            # match parentheses
# |\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}  # match various symbols
# |[0-9]                                            # match digits
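
# Illustrative check (not from the original source): applying the pattern to a short SMILES
# splits it into atom- and bond-level tokens, e.g.
#     re.findall(SMI_REGEX_PATTERN, "CC(=O)O")  ->  ['C', 'C', '(', '=', 'O', ')', 'O']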


def sequence_to_kmers(sequence, k=3):
    """Divide a string into a list of overlapping k-mer strings.

    Parameters:
        sequence (str): input sequence.
        k (int): k-mer length, default 3.

    Returns:
        List of k-mer strings, one per starting position.
    """
    return [sequence[i:i + k] for i in range(len(sequence) - k + 1)]
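
# Example (illustrative, not in the original): 3-mers of a short protein fragment.
#     sequence_to_kmers("MKTAYIA")  ->  ['MKT', 'KTA', 'TAY', 'AYI', 'YIA']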


def sequence_to_word_embedding(sequence, model):
    """Get a protein embedding: map the sequence's 3-mers to a (num_kmers, 100) matrix.

    K-mers missing from the model's vocabulary are left as zero vectors.
    """
    kmers = sequence_to_kmers(sequence)
    vec = np.zeros((len(kmers), 100))
    for i, word in enumerate(kmers):
        try:
            vec[i] = model.wv[word]
        except KeyError:
            # Unknown k-mer: keep the zero vector.
            pass
    return vec
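
# Usage sketch (illustrative; assumes a trained gensim word2vec model with 100-dim vectors,
# loaded from a hypothetical "protein_kmers.model" file):
#     from gensim.models import Word2Vec
#     w2v = Word2Vec.load("protein_kmers.model")
#     emb = sequence_to_word_embedding("MKTAYIAKQR", w2v)   # -> shape (8, 100)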


def sequence_to_token_ids(sequence, tokenizer):
    token_ids = tokenizer.encode(sequence)
    return np.array(token_ids)


# def sequence_to_token_ids(sequence, tokenizer, max_length: int):
#     token_ids = tokenizer.encode(sequence)
#     length = min(max_length, len(token_ids))
#
#     token_ids_padded = np.zeros(max_length, dtype='int')
#     token_ids_padded[:length] = token_ids[:length]
#
#     return token_ids_padded
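
# Usage sketch (illustrative; the vocab path is the default used by SmilesTokenizer below):
#     tokenizer = SmilesTokenizer('resources/vocabs/smiles.txt')
#     ids = sequence_to_token_ids("CC(=O)O", tokenizer)   # np.ndarray of ids incl. [CLS]/[SEP]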


class SmilesTokenizer(BertTokenizer):
    """
    Adapted from https://github.com/deepchem/deepchem/.
    Creates the SmilesTokenizer class. The tokenizer heavily inherits from the BertTokenizer
    implementation found in HuggingFace's transformers library. It runs a WordPiece tokenization
    algorithm over SMILES strings using the tokenization SMILES regex developed by Schwaller et al.
    Please see https://github.com/huggingface/transformers
    and https://github.com/rxn4chemistry/rxnfp for more details.

    Examples
    --------
    >>> tokenizer = SmilesTokenizer(vocab_path, regex_pattern)
    >>> print(tokenizer.encode("CC(=O)OC1=CC=CC=C1C(=O)O"))
    [12, 16, 16, 17, 22, 19, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 16, 17, 22, 19, 18, 19, 13]

    References
    ----------
    .. [1] Schwaller, Philippe; Probst, Daniel; Vaucher, Alain C.; Nair, Vishnu H.; Kreutter, David;
       Laino, Teodoro; et al. (2019): Mapping the Space of Chemical Reactions using Attention-Based
       Neural Networks. ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.9897365.v3

    Note
    ----
    This class requires HuggingFace's transformers and tokenizers libraries to be installed.
    """

    def __init__(
            self,
            vocab_file: str = 'resources/vocabs/smiles.txt',
            regex_pattern: str = SMI_REGEX_PATTERN,
            # unk_token="[UNK]",
            # sep_token="[SEP]",
            # pad_token="[PAD]",
            # cls_token="[CLS]",
            # mask_token="[MASK]",
            **kwargs):
        """Constructs a SmilesTokenizer.

        Parameters
        ----------
        vocab_file: str
            Path to a vocabulary file with one SMILES token per line.
            Defaults to 'resources/vocabs/smiles.txt'.
        regex_pattern: str
            Regex used to split SMILES strings into tokens.
        """
        super().__init__(vocab_file, **kwargs)
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocab file at path '{}'.".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        unused_indexes = [i for i, v in enumerate(self.vocab.keys()) if v.startswith("[unused")]
        self.highest_unused_index = 0 if len(unused_indexes) == 0 else max(unused_indexes)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.basic_tokenizer = BasicSmilesTokenizer(regex_pattern=regex_pattern)

    @property
    def vocab_size(self):
        # BertTokenizer exposes vocab_size as a property, so the override must be one too.
        return len(self.vocab)

    def vocab_list(self):
        return list(self.vocab.keys())

    def _tokenize(self, text: str, max_seq_length: int = 512, **kwargs):
        """Tokenize a string into a list of tokens.

        Parameters
        ----------
        text: str
            Input string sequence to be tokenized.
        """
        max_len_single_sentence = max_seq_length - 2
        split_tokens = self.basic_tokenizer.tokenize(text)[:max_len_single_sentence]
        return split_tokens

    def _convert_token_to_id(self, token: str):
        """Converts a token (str) to an id using the vocab.

        Parameters
        ----------
        token: str
            String token from a larger sequence to be converted to a numerical id.
        """
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index: int):
        """Converts an index (integer) to a token (str) using the vocab.

        Parameters
        ----------
        index: int
            Integer index to be converted back to a string-based token as part of a larger sequence.
        """
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]):
        """Converts a sequence of tokens into a single string.

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.

        Returns
        -------
        out_string: str
            Single string from combined tokens.
        """
        out_string: str = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def add_special_tokens_ids_single_sequence(self,
                                               token_ids: List[Optional[int]]):
        """Adds special tokens to a sequence of ids for sequence classification tasks.
        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------
        token_ids: List[int]
            List of tokenized input ids. Can be obtained using the encode or encode_plus methods.
        """
        return [self.cls_token_id] + token_ids + [self.sep_token_id]

    def add_special_tokens_single_sequence(self, tokens: List[str]):
        """Adds special tokens to a sequence of tokens for sequence classification tasks.
        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.
        """
        return [self.cls_token] + tokens + [self.sep_token]

    def add_special_tokens_ids_sequence_pair(
            self, token_ids_0: List[Optional[int]],
            token_ids_1: List[Optional[int]]) -> List[Optional[int]]:
        """Adds special tokens to a sequence pair for sequence classification tasks.
        A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]

        Parameters
        ----------
        token_ids_0: List[int]
            List of ids for the first string sequence in the sequence pair (A).
        token_ids_1: List[int]
            List of ids for the second string sequence in the sequence pair (B).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def add_padding_tokens(self,
                           token_ids: List[Optional[int]],
                           length: int,
                           right: bool = True) -> List[Optional[int]]:
        """Adds padding tokens so the returned sequence has exactly `length` entries.
        By default padding tokens are added to the right of the sequence.

        Parameters
        ----------
        token_ids: List[Optional[int]]
            List of tokenized input ids. Can be obtained using the encode or encode_plus methods.
        length: int
            Target length of the padded sequence.
        right: bool, default True
            If True, pad on the right; otherwise pad on the left.

        Returns
        -------
        List[int]
        """
        padding = [self.pad_token_id] * (length - len(token_ids))
        if right:
            return token_ids + padding
        else:
            return padding + token_ids
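
# Usage sketch (illustrative, not part of the original module): encode a SMILES and pad it
# to a fixed length; the vocab path is simply the class default shown above.
#     tokenizer = SmilesTokenizer('resources/vocabs/smiles.txt')
#     ids = tokenizer.encode("CC(=O)O")                       # ids incl. [CLS] and [SEP]
#     padded = tokenizer.add_padding_tokens(ids, length=16)   # right-padded with [PAD] ids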


class BasicSmilesTokenizer(object):
    """
    Adapted from https://github.com/deepchem/deepchem/.
    Runs basic SMILES tokenization using a regex pattern developed by Schwaller et al.
    Use this tokenizer when SMILES tokenization is needed without a dependency on
    HuggingFace's transformers library.

    Examples
    --------
    >>> tokenizer = BasicSmilesTokenizer()
    >>> print(tokenizer.tokenize("CC(=O)OC1=CC=CC=C1C(=O)O"))
    ['C', 'C', '(', '=', 'O', ')', 'O', 'C', '1', '=', 'C', 'C', '=', 'C', 'C', '=', 'C', '1', 'C', '(', '=', 'O', ')', 'O']

    References
    ----------
    .. [1] Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter,
       Costas Bekas, and Alpha A. Lee. Molecular Transformer: A Model for Uncertainty-Calibrated
       Chemical Reaction Prediction. ACS Central Science 2019 5 (9), 1572-1583.
       DOI: 10.1021/acscentsci.9b00576
    """

    def __init__(self, regex_pattern: str = SMI_REGEX_PATTERN):
        """Constructs a BasicSmilesTokenizer.

        Parameters
        ----------
        regex_pattern: str
            SMILES token regex.
        """
        self.regex_pattern = regex_pattern
        self.regex = re.compile(self.regex_pattern)

    def tokenize(self, text):
        """Basic tokenization of a SMILES string."""
        tokens = self.regex.findall(text)
        return tokens


def load_vocab(vocab_file):
    """Loads a vocabulary file (one token per line) into an OrderedDict mapping token -> index."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab
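
# Illustrative vocab-file layout (an assumption following the standard BERT convention of one
# token per line; the actual contents of resources/vocabs/smiles.txt are not shown here):
#     [PAD]
#     [unused1]
#     ...
#     [CLS]
#     [SEP]
#     C
#     O
#     =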