## These MolStandardizer classes are due to Paolo Tosco
## It was taken from the FS-Mol github
## (https://github.com/microsoft/FS-Mol/blob/main/fs_mol/preprocessing/utils/
## standardizer.py)
## They ensure that a sequence of standardization operations are applied
## https://gist.github.com/ptosco/7e6b9ab9cc3e44ba0919060beaed198e

import os
import pickle

from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize

# Read once at import time; None when the environment variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
PAD_VALUE = -100
TOX_SMARTS_PATH = "data/tox_smarts.json"

# NOTE(review): these look like the Tox21 assay names -- confirm with the
# data pipeline that consumes them.
TASKS = [
    "NR-AR",
    "NR-AR-LBD",
    "NR-AhR",
    "NR-Aromatase",
    "NR-ER",
    "NR-ER-LBD",
    "NR-PPAR-gamma",
    "SR-ARE",
    "SR-ATAD5",
    "SR-HSE",
    "SR-MMP",
    "SR-p53",
]

KNOWN_DESCR = ["ecfps", "rdkit_descr_quantiles", "maccs", "tox"]

# Indices 0-16 and 25-207 (200 entries in total); indices 17-24 are left out.
USED_200_DESCR = [*range(17), *range(25, 208)]


class Standardizer:
    """
    Simple wrapper class around rdkit Standardizer.

    The rdMolStandardize helper objects (normalizer, reionizer, ...) are
    created lazily on first access through the corresponding properties and
    then reused for every subsequent molecule.
    """

    DEFAULT_CANON_TAUT = False
    DEFAULT_METAL_DISCONNECT = False
    MAX_TAUTOMERS = 100
    MAX_TRANSFORMS = 100
    MAX_RESTARTS = 200
    PREFER_ORGANIC = True

    def __init__(
        self,
        metal_disconnect=None,
        canon_taut=None,
    ):
        """
        Constructor.
        All parameters are optional.

        :param metal_disconnect: if True, metallorganic complexes are
            disconnected; None selects DEFAULT_METAL_DISCONNECT
        :param canon_taut: if True, molecules are converted to their
            canonical tautomer; None selects DEFAULT_CANON_TAUT
        """
        super().__init__()
        if metal_disconnect is None:
            metal_disconnect = self.DEFAULT_METAL_DISCONNECT
        if canon_taut is None:
            canon_taut = self.DEFAULT_CANON_TAUT
        self._canon_taut = canon_taut
        self._metal_disconnect = metal_disconnect
        # Lazily-built rdMolStandardize helpers (see the properties below).
        self._taut_enumerator = None
        self._uncharger = None
        self._lfrag_chooser = None
        self._metal_disconnector = None
        self._normalizer = None
        self._reionizer = None
        self._params = None

    @property
    def params(self):
        """Return the MolStandardize CleanupParameters."""
        if self._params is None:
            self._params = rdMolStandardize.CleanupParameters()
            self._params.maxTautomers = self.MAX_TAUTOMERS
            self._params.maxTransforms = self.MAX_TRANSFORMS
            self._params.maxRestarts = self.MAX_RESTARTS
            self._params.preferOrganic = self.PREFER_ORGANIC
            self._params.tautomerRemoveSp3Stereo = False
        return self._params

    @property
    def canon_taut(self):
        """Return whether tautomer canonicalization will be done."""
        return self._canon_taut

    @property
    def metal_disconnect(self):
        """Return whether metallorganic complexes will be disconnected."""
        return self._metal_disconnect

    @property
    def taut_enumerator(self):
        """Return the TautomerEnumerator object."""
        if self._taut_enumerator is None:
            self._taut_enumerator = rdMolStandardize.TautomerEnumerator(self.params)
        return self._taut_enumerator

    @property
    def uncharger(self):
        """Return the Uncharger object."""
        if self._uncharger is None:
            self._uncharger = rdMolStandardize.Uncharger()
        return self._uncharger

    @property
    def lfrag_chooser(self):
        """Return the LargestFragmentChooser object."""
        if self._lfrag_chooser is None:
            self._lfrag_chooser = rdMolStandardize.LargestFragmentChooser(
                self.params.preferOrganic
            )
        return self._lfrag_chooser

    @property
    def metal_disconnector(self):
        """Return the MetalDisconnector object."""
        if self._metal_disconnector is None:
            self._metal_disconnector = rdMolStandardize.MetalDisconnector()
        return self._metal_disconnector

    @property
    def normalizer(self):
        """Return the Normalizer object."""
        if self._normalizer is None:
            self._normalizer = rdMolStandardize.Normalizer(
                self.params.normalizationsFile, self.params.maxRestarts
            )
        return self._normalizer

    @property
    def reionizer(self):
        """Return the Reionizer object."""
        if self._reionizer is None:
            self._reionizer = rdMolStandardize.Reionizer(self.params.acidbaseFile)
        return self._reionizer

    def charge_parent(self, mol_in):
        """Sequentially apply a series of MolStandardize operations:

        * MetalDisconnector (only when metal_disconnect is enabled)
        * Normalizer
        * Reionizer
        * LargestFragmentChooser
        * Uncharger

        The net result is that a desalted, normalized, neutral
        molecule with implicit Hs is returned.

        :param mol_in: a Chem.Mol
        :return: the standardized Chem.Mol
        Exceptions raised by the underlying RDKit calls are not caught here.
        """
        params = Chem.RemoveHsParameters()
        params.removeAndTrackIsotopes = True
        mol_in = Chem.RemoveHs(mol_in, params, sanitize=False)
        if self._metal_disconnect:
            mol_in = self.metal_disconnector.Disconnect(mol_in)
        normalized = self.normalizer.normalize(mol_in)
        Chem.SanitizeMol(normalized)
        normalized = self.reionizer.reionize(normalized)
        Chem.AssignStereochemistry(normalized)
        normalized = self.lfrag_chooser.choose(normalized)
        normalized = self.uncharger.uncharge(normalized)
        # need this to reassess aromaticity on things like
        # cyclopentadienyl, tropylium, azolium, etc.
        Chem.SanitizeMol(normalized)
        return Chem.RemoveHs(Chem.AddHs(normalized))

    def _enumerate_tautomers(self, mol):
        """Enumerate the tautomers of mol, coping with both RDKit call forms."""
        try:
            return self.taut_enumerator.Enumerate(mol, False)
        except TypeError:
            # we are still on the pre-2021 RDKit API
            return self.taut_enumerator.Enumerate(mol)

    def _pick_canonical(self, tautomers):
        """Return the canonical tautomer out of an enumeration result."""
        try:
            return self.taut_enumerator.PickCanonical(tautomers)
        except AttributeError:
            # we are still on the pre-2021 RDKit API
            # Select by score only: comparing (score, mol) tuples would fall
            # back to comparing the molecules themselves whenever two scores
            # are equal, and Chem.Mol objects cannot be ordered.
            return max(tautomers, key=self.taut_enumerator.ScoreTautomer)

    def standardize_mol(self, mol_in):
        """
        Standardize a single molecule.

        :param mol_in: a Chem.Mol
        :return:
            * (standardized Chem.Mol, n_taut) tuple
              if success. n_taut is 0 when tautomer canonicalization is
              disabled and will be negative if tautomer enumeration was
              aborted due to reaching a limit
            * (None, error_msg) if failure

        This calls self.charge_parent() and, if self._canon_taut is True,
        runs tautomer canonicalization. The "_Name" property of the input
        (or "NONAME" when it is missing or empty) is set on the result.
        """
        n_tautomers = 0
        if not isinstance(mol_in, Chem.Mol):
            error = f"Expected Chem.Mol as input, got {str(type(mol_in))}"
            return None, error

        name = None
        try:
            name = mol_in.GetProp("_Name")
        except KeyError:
            # The molecule carries no name; the placeholder below is used.
            pass
        if not name:
            name = "NONAME"

        try:
            mol_out = self.charge_parent(mol_in)
        except Exception as e:
            error = f"charge_parent FAILED: {str(e).strip()}"
            return None, error

        if self._canon_taut:
            # The API fallbacks live in helpers so that a failure inside a
            # fallback is also reported as (None, error) instead of escaping
            # from an exception handler.
            try:
                res = self._enumerate_tautomers(mol_out)
            except Exception as e:
                # something else went wrong
                error = f"canon_taut FAILED: {str(e).strip()}"
                return None, error
            n_tautomers = len(res)
            if hasattr(res, "status"):
                completed = (
                    res.status == rdMolStandardize.TautomerEnumeratorStatus.Completed
                )
            else:
                # we are still on the pre-2021 RDKit API
                completed = len(res) < 1000
            if not completed:
                # A negative count signals an incomplete enumeration.
                n_tautomers = -n_tautomers
            try:
                mol_out = self._pick_canonical(res)
            except Exception as e:
                # something else went wrong
                error = f"canon_taut FAILED: {str(e).strip()}"
                return None, error

        mol_out.SetProp("_Name", name)
        return mol_out, n_tautomers


def load_pickle(path: str) -> object:
    """Return the object unpickled from the file at path.

    :param path: location of the pickle file to read
    :return: the deserialized object
    """
    # NOTE(security): unpickling can execute arbitrary code; only call this
    # on files that come from a trusted source.
    with open(path, "rb") as file:
        content = pickle.load(file)
    return content


def write_pickle(path: str, obj: object) -> None:
    """Pickle obj into the file at path, overwriting any existing file.

    :param path: location of the pickle file to write
    :param obj: the object to serialize
    """
    with open(path, "wb") as file:
        pickle.dump(obj, file)