import collections
from importlib import resources
import os
import re
from typing import Optional, List
import numpy as np
from transformers import BertTokenizer
SMI_REGEX_PATTERN = r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""
# \[[^\]]+\] # match anything inside square brackets
# |Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p # match elements
# |\(|\) # match parentheses
# |\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2} # match various symbols
# |[0-9] # match digits
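# For example, re.findall(SMI_REGEX_PATTERN, "CC(=O)O") yields
# ['C', 'C', '(', '=', 'O', ')', 'O'].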
def sequence_to_kmers(sequence, k=3):
""" Divide a string into a list of kmers strings.
Parameters:
sequence (string)
k (int), default 3
Returns:
List containing a list of kmers.
"""
return [sequence[i:i + k] for i in range(len(sequence) - k + 1)]
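# Example: sequence_to_kmers("MKTAYIA", k=3)
# -> ['MKT', 'KTA', 'TAY', 'AYI', 'YIA']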
def sequence_to_word_embedding(sequence, model):
    """Get a protein embedding by mapping the sequence's 3-mers to a (num_kmers, 100) matrix."""
    kmers = sequence_to_kmers(sequence)
    vec = np.zeros((len(kmers), 100))
    for i, word in enumerate(kmers):
        try:
            vec[i, :] = model.wv[word]
        except KeyError:
            # k-mers missing from the word2vec vocabulary are left as zero vectors
            pass
    return vec
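# A minimal usage sketch, assuming a gensim word2vec model trained on protein
# 3-mers with vector_size=100 (the model path below is hypothetical):
#
#   from gensim.models import Word2Vec
#   w2v = Word2Vec.load("protein_3mer_word2vec.model")
#   emb = sequence_to_word_embedding("MKTAYIAKQR", w2v)  # shape: (8, 100)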
def sequence_to_token_ids(sequence, tokenizer):
    """Encode a sequence into a NumPy array of token ids using the given tokenizer."""
    token_ids = tokenizer.encode(sequence)
    return np.array(token_ids)
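# A minimal usage sketch with the SmilesTokenizer defined below (the vocab path
# shown is the module default and may need adjusting for your setup):
#
#   tokenizer = SmilesTokenizer('resources/vocabs/smiles.txt')
#   ids = sequence_to_token_ids("CC(=O)O", tokenizer)  # np.ndarray of token ids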
# def sequence_to_token_ids(sequence, tokenizer, max_length: int):
# token_ids = tokenizer.encode(sequence)
# length = min(max_length, len(token_ids))
#
# token_ids_padded = np.zeros(max_length, dtype='int')
# token_ids_padded[:length] = token_ids[:length]
#
# return token_ids_padded
class SmilesTokenizer(BertTokenizer):
"""
Adapted from https://github.com/deepchem/deepchem/.
Creates the SmilesTokenizer class. The tokenizer heavily inherits from the BertTokenizer
implementation found in Huggingface's transformers library. It runs a WordPiece tokenization
algorithm over SMILES strings using the tokenization SMILES regex developed by Schwaller et al.
Please see https://github.com/huggingface/transformers
and https://github.com/rxn4chemistry/rxnfp for more details.
Examples
--------
>>> tokenizer = SmilesTokenizer(vocab_path, regex_pattern)
>>> print(tokenizer.encode("CC(=O)OC1=CC=CC=C1C(=O)O"))
[12, 16, 16, 17, 22, 19, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 16, 17, 22, 19, 18, 19, 13]
References
----------
.. [1] Schwaller, Philippe; Probst, Daniel; Vaucher, Alain C.; Nair, Vishnu H; Kreutter, David;
Laino, Teodoro; et al. (2019): Mapping the Space of Chemical Reactions using Attention-Based Neural
Networks. ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.9897365.v3
Note
----
This class requires huggingface's transformers and tokenizers libraries to be installed.
"""
def __init__(
self,
vocab_file: str = 'resources/vocabs/smiles.txt',
regex_pattern: str = SMI_REGEX_PATTERN,
# unk_token="[UNK]",
# sep_token="[SEP]",
# pad_token="[PAD]",
# cls_token="[CLS]",
# mask_token="[MASK]",
**kwargs):
"""Constructs a SmilesTokenizer.
Parameters
----------
vocab_file: str
Path to a SMILES character per line vocabulary file.
Default vocab file is found in deepchem/feat/tests/data/vocab.txt
"""
super().__init__(vocab_file, **kwargs)
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocab file at path '{}'.".format(vocab_file))
self.vocab = load_vocab(vocab_file)
unused_indexes = [i for i, v in enumerate(self.vocab.keys()) if v.startswith("[unused")]
self.highest_unused_index = 0 if len(unused_indexes) == 0 else max(unused_indexes)
self.ids_to_tokens = collections.OrderedDict([
(ids, tok) for tok, ids in self.vocab.items()
])
self.basic_tokenizer = BasicSmilesTokenizer(regex_pattern=regex_pattern)
@property
def vocab_size(self):
return len(self.vocab)
@property
def vocab_list(self):
return list(self.vocab.keys())
    def _tokenize(self, text: str, max_seq_length: int = 512, **kwargs):
        """Tokenize a string into a list of tokens.
        Parameters
        ----------
        text: str
            Input string sequence to be tokenized.
        max_seq_length: int, default 512
            Maximum sequence length; two positions are reserved for the
            [CLS] and [SEP] special tokens.
        """
        max_len_single_sentence = max_seq_length - 2
        return self.basic_tokenizer.tokenize(text)[:max_len_single_sentence]
def _convert_token_to_id(self, token: str):
"""Converts a token (str/unicode) in an id using the vocab.
Parameters
----------
token: str
String token from a larger sequence to be converted to a numerical id.
"""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index: int):
"""Converts an index (integer) in a token (string/unicode) using the vocab.
Parameters
----------
index: int
Integer index to be converted back to a string-based token as part of a larger sequence.
"""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens: List[str]):
"""Converts a sequence of tokens (string) in a single string.
Parameters
----------
tokens: List[str]
List of tokens for a given string sequence.
Returns
-------
out_string: str
Single string from combined tokens.
"""
out_string: str = " ".join(tokens).replace(" ##", "").strip()
return out_string
def add_special_tokens_ids_single_sequence(self,
token_ids: List[Optional[int]]):
"""Adds special tokens to a sequence for sequence classification tasks.
A BERT sequence has the following format: [CLS] X [SEP]
Parameters
----------
token_ids: list[int]
list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
"""
return [self.cls_token_id] + token_ids + [self.sep_token_id]
def add_special_tokens_single_sequence(self, tokens: List[str]):
"""Adds special tokens to the a sequence for sequence classification tasks.
A BERT sequence has the following format: [CLS] X [SEP]
Parameters
----------
tokens: List[str]
List of tokens for a given string sequence.
"""
return [self.cls_token] + tokens + [self.sep_token]
def add_special_tokens_ids_sequence_pair(
self, token_ids_0: List[Optional[int]],
token_ids_1: List[Optional[int]]) -> List[Optional[int]]:
"""Adds special tokens to a sequence pair for sequence classification tasks.
A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
Parameters
----------
token_ids_0: List[int]
List of ids for the first string sequence in the sequence pair (A).
token_ids_1: List[int]
            List of ids for the second string sequence in the sequence pair (B).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def add_padding_tokens(self,
token_ids: List[Optional[int]],
length: int,
right: bool = True) -> List[Optional[int]]:
"""Adds padding tokens to return a sequence of length max_length.
By default padding tokens are added to the right of the sequence.
Parameters
----------
token_ids: list[optional[int]]
list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
length: int
right: bool, default True
Returns
-------
List[int]
"""
padding = [self.pad_token_id] * (length - len(token_ids))
if right:
return token_ids + padding
else:
return padding + token_ids
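# A minimal end-to-end sketch of the SmilesTokenizer helpers above (the vocab
# path is the module default and may need adjusting for your setup):
#
#   tokenizer = SmilesTokenizer('resources/vocabs/smiles.txt')
#   tokens = tokenizer.tokenize("CC(=O)OC1=CC=CC=C1C(=O)O")
#   ids = tokenizer.convert_tokens_to_ids(tokens)
#   ids = tokenizer.add_special_tokens_ids_single_sequence(ids)
#   ids = tokenizer.add_padding_tokens(ids, length=64)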
class BasicSmilesTokenizer(object):
"""
Adapted from https://github.com/deepchem/deepchem/.
    Run basic SMILES tokenization using a regex pattern developed by Schwaller et al.
    Use this tokenizer when a tokenizer that does not require HuggingFace's transformers library is needed.
Examples
--------
>>> tokenizer = BasicSmilesTokenizer()
>>> print(tokenizer.tokenize("CC(=O)OC1=CC=CC=C1C(=O)O"))
['C', 'C', '(', '=', 'O', ')', 'O', 'C', '1', '=', 'C', 'C', '=', 'C', 'C', '=', 'C', '1', 'C', '(', '=', 'O', ')', 'O']
References
----------
.. [1] Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
1572-1583 DOI: 10.1021/acscentsci.9b00576
"""
def __init__(self, regex_pattern: str = SMI_REGEX_PATTERN):
"""Constructs a BasicSMILESTokenizer.
Parameters
----------
regex: string
SMILES token regex
"""
self.regex_pattern = regex_pattern
self.regex = re.compile(self.regex_pattern)
    def tokenize(self, text):
        """Basic tokenization of a SMILES string."""
        return self.regex.findall(text)
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
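# A vocabulary file is expected to contain one token per line; the (0-based) line
# number becomes the token id. A minimal sketch of such a file (the actual contents
# of resources/vocabs/smiles.txt are an assumption here) might look like:
#
#   [PAD]
#   [UNK]
#   [CLS]
#   [SEP]
#   [MASK]
#   C
#   O
#   =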