# molecular_conditional_generation / metric_calculator.py
# feiyang-cai's picture
# update
# 1d1d4f3
from sklearn.metrics import mean_squared_error, roc_auc_score, r2_score
from rdkit.Chem import QED, Crippen, MolFromSmiles, rdmolops, rdMolDescriptors, AllChem
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
import networkx as nx
import os.path as op
import math
#from rdkit.six.moves import cPickle
import _pickle as cPickle
#from rdkit.six import iteritems
from rdkit import Chem
import pickle
import numpy as np
import sys
import os
from rdkit.Chem import RDConfig
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
from rdkit.Chem.Fingerprints import FingerprintMols
def compute_rmse(gt, pred):
    """Return the root-mean-squared error between ``gt`` and ``pred``.

    The ``squared=False`` keyword of ``mean_squared_error`` has been
    deprecated and removed in recent scikit-learn releases, so the square
    root is taken explicitly here. Per-output MSE values are rooted before
    being averaged, which is what ``squared=False`` computed.

    Args:
        gt: Ground-truth target values.
        pred: Predicted values, same length as ``gt``.

    Returns:
        The RMSE as a float.
    """
    per_output_mse = mean_squared_error(gt, pred, multioutput="raw_values")
    return float(np.mean(np.sqrt(per_output_mse)))
def compute_r2score(gt, pred):
    """Return the R^2 score of ``pred`` against the ground truth ``gt``.

    Thin wrapper around ``sklearn.metrics.r2_score``.
    """
    score = r2_score(y_true=gt, y_pred=pred)
    return score
def compute_roc_auc(gt, pred):
    """Return the ROC AUC of the scores ``pred`` against the labels ``gt``.

    Thin wrapper around ``sklearn.metrics.roc_auc_score``.
    """
    auc = roc_auc_score(y_true=gt, y_score=pred)
    return auc
def check_valid(smiles_list):
    """Return the fraction of entries in ``smiles_list`` that are not ``""``.

    Empty strings are counted as invalid entries.

    Args:
        smiles_list: List of SMILES strings.

    Returns:
        A float in [0, 1]; ``0.0`` for an empty list (instead of dividing
        by zero).
    """
    total_num = len(smiles_list)
    if total_num == 0:
        return 0.0
    empty_num = smiles_list.count("")
    return 1 - empty_num / float(total_num)
def check_unique(smiles_list):
    """Return the number of distinct non-empty entries divided by the list length.

    The denominator is the total number of entries, including empty strings.

    Args:
        smiles_list: List of SMILES strings; ``""`` entries are not counted
            as distinct molecules.

    Returns:
        A float in [0, 1]; ``0.0`` for an empty list (instead of dividing
        by zero).
    """
    total_num = len(smiles_list)
    if total_num == 0:
        return 0.0
    distinct = set(smiles_list)
    distinct.discard("")
    return len(distinct) / float(total_num)
def check_nolvelty(gen_smiles, train_smiles):
    """Return the percentage of ``gen_smiles`` entries absent from ``train_smiles``.

    Note that the result is a percentage in [0, 100], not a fraction.

    Args:
        gen_smiles: Sized collection of generated SMILES strings.
        train_smiles: Collection of reference SMILES strings.

    Returns:
        ``0.`` when ``gen_smiles`` is empty, otherwise the novel percentage.
    """
    if len(gen_smiles) == 0:
        return 0.
    # Build the lookup set once so each membership test is O(1) rather than
    # a linear scan of the training collection.
    if isinstance(train_smiles, (set, frozenset)):
        known = train_smiles
    else:
        known = set(train_smiles)
    novel = sum(1 for smiles in gen_smiles if smiles not in known)
    return novel * 100. / len(gen_smiles)
# Fragment-id -> score lookup, filled lazily by readFragmentScores().
_fscores = None


def readFragmentScores(name='fpscores'):
    """Load a gzipped, pickled fragment-score table into ``_fscores``.

    The file ``<name>.pkl.gz`` is expected to hold a sequence of entries
    whose first item is a score and whose remaining items are the fragment
    ids that share that score.

    Args:
        name: Path of the table without the ``.pkl.gz`` suffix. The default
            ``'fpscores'`` is resolved relative to this module's directory.
    """
    import gzip
    global _fscores
    # generate the full path filename:
    if name == "fpscores":
        name = op.join(op.dirname(__file__), name)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the gzip handle, which was previously leaked.
    with gzip.open('%s.pkl.gz' % name) as handle:
        raw_scores = cPickle.load(handle)
    table = {}
    for entry in raw_scores:
        for fragment_id in entry[1:]:
            table[fragment_id] = float(entry[0])
    _fscores = table
def numBridgeheadsAndSpiro(mol,ri=None):
    """Return ``(bridgehead_atom_count, spiro_atom_count)`` for ``mol``.

    ``ri`` is accepted for signature compatibility but is not used here.
    """
    spiro_count = rdMolDescriptors.CalcNumSpiroAtoms(mol)
    bridgehead_count = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
    return (bridgehead_count, spiro_count)
def calculateScore(m):
    """Return the synthetic-accessibility score of molecule ``m``.

    The raw score is the sum of a fragment-contribution term (looked up in
    ``_fscores``, loaded on first use), a set of structural-complexity
    penalties, and a fingerprint-density correction. It is then mapped onto
    a 1-10 scale, smoothed near the upper end, and clamped to [1, 10].

    Args:
        m: RDKit molecule.

    Returns:
        The score as a float between 1.0 and 10.0.
    """
    if _fscores is None:
        readFragmentScores()

    # fragment score
    fp = rdMolDescriptors.GetMorganFingerprint(m, 2)  # 2 is the *radius* of the circular fingerprint
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 0
    # ``iteritems`` was never imported in this module; the fingerprint
    # elements form a plain dict, so ``.items()`` is used directly.
    for bitId, v in fps.items():
        nf += v
        # fragments missing from the table get the default contribution -4
        score1 += _fscores.get(bitId, -4) * v
    score1 /= nf

    # features score
    nAtoms = m.GetNumAtoms()
    nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
    ri = m.GetRingInfo()
    nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
    nMacrocycles = 0
    for x in ri.AtomRings():
        if len(x) > 8:
            nMacrocycles += 1

    sizePenalty = nAtoms**1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters + 1)
    spiroPenalty = math.log10(nSpiro + 1)
    bridgePenalty = math.log10(nBridgeheads + 1)
    macrocyclePenalty = 0.
    # ---------------------------------------
    # This differs from the paper, which defines:
    #  macrocyclePenalty = math.log10(nMacrocycles+1)
    # This form generates better results when 2 or more macrocycles are present
    if nMacrocycles > 0:
        macrocyclePenalty = math.log10(2)

    score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty

    # correction for the fingerprint density
    # not in the original publication, added in version 1.1
    # to make highly symmetrical molecules easier to synthetise
    score3 = 0.
    if nAtoms > len(fps):
        score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # need to transform "raw" value into scale between 1 and 10
    # (named raw_min/raw_max so the builtins min/max are not shadowed)
    raw_min = -4.0
    raw_max = 2.5
    sascore = 11. - (sascore - raw_min + 1) / (raw_max - raw_min) * 9.
    # smooth the 10-end
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0

    return sascore
def compute_plogp(mol):
    """Return the penalized logP of ``mol``.

    The value is the Crippen logP, minus the score from ``calculateScore``,
    minus the number of atoms by which the largest ring in the cycle basis
    exceeds six. None of the three terms is normalised.
    """
    logp = Crippen.MolLogP(mol)
    sa_term = -calculateScore(mol)

    # Ring sizes come from a cycle basis of the molecular graph.
    adjacency = rdmolops.GetAdjacencyMatrix(mol)
    ring_sizes = [len(ring) for ring in nx.cycle_basis(nx.Graph(adjacency))]
    largest_ring = max(ring_sizes) if ring_sizes else 0

    # Only the excess over a six-membered ring is penalised.
    ring_term = -max(largest_ring - 6, 0)

    return logp + sa_term + ring_term
# Classifier used by compute_drd2(); populated lazily by load_model().
clf_model = None


def load_model():
    """Unpickle ``drd2_current.pkl`` from this module's directory into ``clf_model``."""
    global clf_model
    #name = op.join(op.dirname(__file__), 'clf_py36.pkl')
    module_dir = op.dirname(__file__)
    model_path = op.join(module_dir, 'drd2_current.pkl')
    # NOTE: pickle can run arbitrary code on load; the model file must be trusted.
    with open(model_path, "rb") as model_file:
        clf_model = pickle.load(model_file)
def fingerprints_from_mol(mol):
    """Return a folded Morgan count fingerprint of ``mol`` as a ``(1, 2048)`` int32 array.

    The fingerprint is computed with radius 3, ``useCounts=True`` and
    ``useFeatures=True``; each element id is folded into one of 2048 bins
    by modulo, and colliding counts are summed.
    """
    n_bins = 2048
    folded = np.zeros((1, n_bins), np.int32)
    morgan = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
    for element_id, count in morgan.GetNonzeroElements().items():
        folded[0, element_id % n_bins] += int(count)
    return folded
def compute_drd2(mol):
    """Return the classifier's predicted score for ``mol``.

    The classifier is loaded on first use via ``load_model``.

    Args:
        mol: RDKit molecule; a falsy value (e.g. ``None``) yields ``0.0``.

    Returns:
        Column 1 of ``clf_model.predict_proba`` for the molecule's
        fingerprint, as a Python float (presumably the positive-class
        probability; confirm against the pickled model).
    """
    if clf_model is None:
        load_model()
    if not mol:
        return 0.0
    fp = fingerprints_from_mol(mol)
    score = clf_model.predict_proba(fp)[:, 1]
    # ``fp`` holds a single row, so ``score`` has exactly one element. Index
    # it explicitly: calling float() on a 1-d array is deprecated in NumPy.
    return float(score[0])
def compute_qed(mol):
    """Return the value of RDKit's ``QED.qed`` for ``mol``."""
    qed_value = QED.qed(mol)
    return qed_value
def compute_logp(mol):
    """Return the value of RDKit's ``Crippen.MolLogP`` for ``mol``."""
    logp_value = Crippen.MolLogP(mol)
    return logp_value
def compute_tpsa(mol):
    """Return the value of RDKit's ``rdMolDescriptors.CalcTPSA`` for ``mol``."""
    tpsa_value = rdMolDescriptors.CalcTPSA(mol)
    return tpsa_value
def compute_sas(mol):
    """Return the score of ``mol`` from the RDKit Contrib ``sascorer`` module.

    This calls ``sascorer.calculateScore``, not the ``calculateScore``
    defined in this file.
    """
    sa_value = sascorer.calculateScore(mol)
    return sa_value
def check_valid_unique(smiles_list):
    """Return ``(validity, uniqueness)`` for ``smiles_list``.

    Validity is the fraction of entries that are not ``""``. Uniqueness is
    the number of distinct non-empty entries divided by the number of
    non-empty entries (not by the total, unlike ``check_unique``).

    Args:
        smiles_list: List of SMILES strings.

    Returns:
        A ``(validity, uniqueness)`` tuple of floats. Either ratio is
        ``0.0`` when its denominator would be zero (empty list, or a list
        of only empty strings) instead of raising ``ZeroDivisionError``.
    """
    total_num = len(smiles_list)
    if total_num == 0:
        return 0.0, 0.0
    empty_num = smiles_list.count("")
    valid_num = total_num - empty_num
    validity = 1 - empty_num / float(total_num)
    if valid_num == 0:
        return validity, 0.0
    distinct = set(smiles_list)
    distinct.discard("")
    return validity, len(distinct) / float(valid_num)
def get_similarity(smiles1, smiles2):
    """Return the Tanimoto similarity between two molecules given as SMILES.

    Args:
        smiles1: First SMILES string.
        smiles2: Second SMILES string.

    Returns:
        The Tanimoto similarity of the two molecules' ``FingerprintMols``
        fingerprints, or ``np.nan`` when either input is empty or cannot be
        parsed into a molecule.
    """
    if not smiles1 or not smiles2:
        return np.nan
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)
    # MolFromSmiles returns None for unparsable input; fingerprinting None
    # would raise, so report "no similarity available" as for empty input.
    if mol1 is None or mol2 is None:
        return np.nan
    return TanimotoSimilarity(FingerprintMols.FingerprintMol(mol1),
                              FingerprintMols.FingerprintMol(mol2))
def get_scaffold(smiles):
    """Return the Murcko scaffold SMILES for the molecule given as ``smiles``."""
    return MurckoScaffoldSmiles(smiles=smiles)