# # calculation of synthetic accessibility score as described in: # # Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions # Peter Ertl and Ansgar Schuffenhauer # Journal of Cheminformatics 1:8 (2009) # http://www.jcheminf.com/content/1/1/8 # # several small modifications to the original paper are included # particularly slightly different formula for marocyclic penalty # and taking into account also molecule symmetry (fingerprint density) # # for a set of 10k diverse molecules the agreement between the original method # as implemented in PipelinePilot and this implementation is r2 = 0.97 # # peter ertl & greg landrum, september 2013 # from __future__ import print_function from rdkit import Chem from rdkit.Chem import rdMolDescriptors from rdkit.six.moves import cPickle from rdkit.six import iteritems import math from collections import defaultdict import os.path as op _fscores = None def readFragmentScores(name='models/fpscores'): import gzip global _fscores # generate the full path filename: if name == "fpscores": name = op.join(op.dirname(__file__), name) _fscores = cPickle.load(gzip.open('%s.pkl.gz' % name)) outDict = {} for i in _fscores: for j in range(1, len(i)): outDict[i[j]] = float(i[0]) _fscores = outDict def numBridgeheadsAndSpiro(mol, ri=None): nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(mol) nBridgehead = rdMolDescriptors.CalcNumBridgeheadAtoms(mol) return nBridgehead, nSpiro def calculateScore(m): if _fscores is None: readFragmentScores() # fragment score fp = rdMolDescriptors.GetMorganFingerprint(m, 2) #<- 2 is the *radius* of the circular fingerprint fps = fp.GetNonzeroElements() score1 = 0. nf = 0 for bitId, v in iteritems(fps): nf += v sfp = bitId score1 += _fscores.get(sfp, -4) * v score1 /= nf # features score nAtoms = m.GetNumAtoms() nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True)) ri = m.GetRingInfo() nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri) nMacrocycles = 0 for x in ri.AtomRings(): if len(x) > 8: nMacrocycles += 1 sizePenalty = nAtoms**1.005 - nAtoms stereoPenalty = math.log10(nChiralCenters + 1) spiroPenalty = math.log10(nSpiro + 1) bridgePenalty = math.log10(nBridgeheads + 1) macrocyclePenalty = 0. # --------------------------------------- # This differs from the paper, which defines: # macrocyclePenalty = math.log10(nMacrocycles+1) # This form generates better results when 2 or more macrocycles are present if nMacrocycles > 0: macrocyclePenalty = math.log10(2) score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty # correction for the fingerprint density # not in the original publication, added in version 1.1 # to make highly symmetrical molecules easier to synthetise score3 = 0. if nAtoms > len(fps): score3 = math.log(float(nAtoms) / len(fps)) * .5 sascore = score1 + score2 + score3 # need to transform "raw" value into scale between 1 and 10 min = -4.0 max = 2.5 sascore = 11. - (sascore - min + 1) / (max - min) * 9. # smooth the 10-end if sascore > 8.: sascore = 8. + math.log(sascore + 1. - 9.) if sascore > 10.: sascore = 10.0 elif sascore < 1.: sascore = 1.0 return sascore def processMols(mols): print('smiles\tName\tsa_score') for i, m in enumerate(mols): if m is None: continue s = calculateScore(m) smiles = Chem.MolToSmiles(m) print(smiles + "\t" + m.GetProp('_Name') + "\t%3f" % s) if __name__ == '__main__': import sys, time t1 = time.time() readFragmentScores("fpscores") t2 = time.time() suppl = Chem.SmilesMolSupplier(sys.argv[1]) t3 = time.time() processMols(suppl) t4 = time.time() print('Reading took %.2f seconds. Calculating took %.2f seconds' % ((t2 - t1), (t4 - t3)), file=sys.stderr) # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # * Neither the name of Novartis Institutes for BioMedical Research Inc. # nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #