|
|
from rdkit import Chem
|
|
|
from rdkit.Chem import Draw
|
|
|
from rdkit.Chem import AllChem, MACCSkeys, rdMolDescriptors as rdDesc
|
|
|
from collections import defaultdict
|
|
|
import numpy as np
|
|
|
import os, pickle, hashlib
|
|
|
|
|
|
AllChem.SetPreferCoordGen(True)
|
|
|
|
|
|
FINGERPRINT_DICT = defaultdict(lambda : len(FINGERPRINT_DICT))
|
|
|
|
|
|
ELEMENTS = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al',
|
|
|
'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn',
|
|
|
'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb',
|
|
|
'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In',
|
|
|
'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm',
|
|
|
'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta',
|
|
|
'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At',
|
|
|
'Rn', 'Fr', 'Ra', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk',
|
|
|
'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt',
|
|
|
'Ds', 'Rg', 'Cn', 'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og']
|
|
|
|
|
|
for e in ELEMENTS:
|
|
|
FINGERPRINT_DICT[e]
|
|
|
|
|
|
if os.path.exists('rdkit_fingerprint_list_r1.pkl'):
|
|
|
l = pickle.load(open('rdkit_fingerprint_list_r1.pkl', 'rb'))
|
|
|
|
|
|
for smi in l:
|
|
|
FINGERPRINT_DICT[smi]
|
|
|
|
|
|
print('Len fingerprint_list: %s' %len(FINGERPRINT_DICT)) + len(ELEMENTS)
|
|
|
|
|
|
def mol_with_atom_index(mol):
|
|
|
atoms = mol.GetNumAtoms()
|
|
|
for idx in range(atoms):
|
|
|
mol.GetAtomWithIdx(idx).SetProp('molAtomMapNumber', str(mol.GetAtomWithIdx(idx).GetIdx()))
|
|
|
return mol
|
|
|
|
|
|
def prepare_mol_for_drawing(mol):
|
|
|
try:
|
|
|
mol_draw = Draw.rdMolDraw2D.PrepareMolForDrawing(mol)
|
|
|
except Chem.KekulizeException:
|
|
|
mol_draw = Draw.rdMolDraw2D.PrepareMolForDrawing(mol, kekulize=False)
|
|
|
Chem.SanitizeMol(mol_draw, Chem.SANITIZE_ALL ^ Chem.SANITIZE_KEKULIZE)
|
|
|
return mol_draw
|
|
|
|
|
|
def get_atom_submol_radn(mol, radius, sanitize=True):
|
|
|
atoms = []
|
|
|
submols = []
|
|
|
|
|
|
for atom in mol.GetAtoms():
|
|
|
atoms.append(atom)
|
|
|
r = radius
|
|
|
while r > 0:
|
|
|
try:
|
|
|
env = Chem.FindAtomEnvironmentOfRadiusN(mol, r, atom.GetIdx())
|
|
|
amap={}
|
|
|
submol = Chem.PathToSubmol(mol, env, atomMap=amap)
|
|
|
if sanitize:
|
|
|
Chem.SanitizeMol(submol, sanitizeOps=Chem.SanitizeFlags.SANITIZE_ALL^Chem.SanitizeFlags.SANITIZE_KEKULIZE)
|
|
|
|
|
|
submols.append(submol)
|
|
|
break
|
|
|
except Exception as e:
|
|
|
print(64, e)
|
|
|
r -= 1
|
|
|
|
|
|
return atoms, submols
|
|
|
|
|
|
def gen_fps_from_mol(mol, nbits=256, use_morgan=True, use_macc=False, use_rdkit=False):
|
|
|
|
|
|
fp = []
|
|
|
if use_morgan:
|
|
|
fp_vec = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nbits)
|
|
|
fp1 = np.frombuffer(fp_vec.ToBitString().encode(), 'u1') - ord('0')
|
|
|
fp = fp1.tolist()
|
|
|
if use_macc:
|
|
|
|
|
|
fp_vec = MACCSkeys.GenMACCSKeys(mol)
|
|
|
fp1 = np.frombuffer(fp_vec.ToBitString().encode(), 'u1') - ord('0')
|
|
|
fp.extend(fp1.tolist())
|
|
|
if use_rdkit:
|
|
|
fp_vec = Chem.RDKFingerprint(mol)
|
|
|
fp1 = np.frombuffer(fp_vec.ToBitString().encode(), 'u1') - ord('0')
|
|
|
fp.extend(fp1.tolist())
|
|
|
|
|
|
return fp
|
|
|
|
|
|
def gen_subgraph_fps_from_str(s, wordsdict={}):
|
|
|
if s in wordsdict:
|
|
|
return [wordsdict[s]]
|
|
|
else:
|
|
|
return [len(wordsdict)]
|
|
|
|
|
|
def gen_subgraph_fps_from_mol(mol, wordsdict={}):
|
|
|
try:
|
|
|
k = Chem.MolToSmiles(mol)
|
|
|
return gen_subgraph_fps_from_str(k, wordsdict)
|
|
|
except Exception as e:
|
|
|
print(e)
|
|
|
return [len(wordsdict)]
|
|
|
|
|
|
def calc_subgraph_fps_from_mol(mol, radius=2, nbits=128, use_macc=True, fptype=1, wordsdict={}):
|
|
|
|
|
|
atoms, submols = get_atom_submol_radn(mol, radius, True)
|
|
|
feats = []
|
|
|
for idx, submol in enumerate(submols):
|
|
|
if fptype == 1:
|
|
|
feat = gen_fps_from_mol(submol, nbits, use_macc)
|
|
|
feats.append(feat)
|
|
|
elif fptype == 2:
|
|
|
feat = gen_subgraph_fps_from_mol(submol, wordsdict)
|
|
|
feats.append(feat)
|
|
|
|
|
|
return np.array(feats)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
smi = 'C=C(S)C(N)(O)C'
|
|
|
smi = 'CC1CCN(CC1N(C)C2=NC=NC3=C2C=CN3)C(=O)CC#N'
|
|
|
|
|
|
mol = Chem.MolFromSmiles(smi, sanitize=False)
|
|
|
|
|
|
print(calc_subgraph_fps_from_mol(mol, 3))
|
|
|
|
|
|
mol = mol_with_atom_index(mol)
|
|
|
submols = get_atom_submol_radn(mol, 3)
|
|
|
submols = [prepare_mol_for_drawing(m) for m in submols]
|
|
|
hl = []
|
|
|
for idx, m in enumerate(submols):
|
|
|
for a in m.GetAtoms():
|
|
|
if int(a.GetProp('molAtomMapNumber')) == idx:
|
|
|
hl.append([a.GetIdx()])
|
|
|
break
|
|
|
|
|
|
draw = Draw.MolsToGridImage([mol] + submols, highlightAtomLists=[[]] + hl, molsPerRow=5)
|
|
|
draw.show()
|
|
|
|
|
|
|