import csv

import networkx as nx
from joblib import Parallel, delayed
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import QED

from src.delinker_utils import sascorer


def _strip_dummy_atoms(mol):
    """Return a copy of *mol* with every dummy atom ('*') removed.

    Each dummy atom is first replaced by a hydrogen, and the hydrogens are
    then stripped with ``Chem.RemoveHs``.
    """
    replaced = AllChem.ReplaceSubstructs(
        mol, Chem.MolFromSmiles('*'), Chem.MolFromSmiles('[H]'), True
    )[0]
    return Chem.RemoveHs(replaced)


def read_triples_file(filename):
    """Read a space-separated .smi file.

    Returns a list with one entry per non-blank line; each entry is the list
    of (at most) the first three space-separated fields of that line.
    """
    smiles = []
    with open(filename, 'r') as f:
        for line in f:
            stripped = line.strip()
            # Lines read from a file always contain at least the newline, so
            # the emptiness test has to be done on the stripped text.
            if stripped:
                smiles.append(stripped.split(' ')[0:3])
    return smiles


def remove_dummys(smi_string):
    """Return the SMILES of *smi_string* with all dummy atoms ('*') removed."""
    return Chem.MolToSmiles(_strip_dummy_atoms(Chem.MolFromSmiles(smi_string)))


def sa_filter(results, verbose=True):
    """Return the fraction of pairs whose SA score improved.

    *results* is an iterable of lists; every element ``m`` of those lists is
    indexable with ``m[0]`` and ``m[1]`` being SMILES strings.  A pair counts
    as a success when the SA score of ``m[1]`` is strictly lower than the SA
    score of ``m[0]``.

    Raises ``ZeroDivisionError`` if *results* contains no pairs at all and
    ``TypeError`` if a SMILES cannot be parsed (``calc_mol_props`` returns
    ``None`` in that case).
    """
    count = 0
    total = 0
    for processed, res in enumerate(results):
        total += len(res)
        for m in res:
            # Check SA score has improved
            if calc_mol_props(m[1])[1] < calc_mol_props(m[0])[1]:
                count += 1
        # Progress
        if verbose and processed % 10 == 0:
            print("\rProcessed %d" % processed, end="")
    print("\r", end="")
    return count / total


def ring_check_res(res, clean_frag):
    """Apply the ring filter to the linker of one generated molecule.

    ``res[1]`` is the SMILES of the generated molecule; *clean_frag* (an RDKit
    mol) is deleted from it and what is left is checked with
    ``check_ring_filter``.
    """
    gen_mol = Chem.MolFromSmiles(res[1])
    linker = Chem.DeleteSubstructs(gen_mol, clean_frag)
    return check_ring_filter(linker)


def ring_filter(results, verbose=True):
    """Return the fraction of generated molecules that pass the ring filter.

    *results* is an iterable of lists; for every element ``m``, ``m[0]`` is
    the SMILES of the starting fragments (with dummy atoms) and ``m[1]`` the
    SMILES of the generated molecule.

    Raises ``ZeroDivisionError`` if *results* contains no molecules at all.
    """
    count = 0
    total = 0
    for processed, res in enumerate(results):
        total += len(res)
        for m in res:
            # Clean frags
            clean_frag = _strip_dummy_atoms(Chem.MolFromSmiles(m[0]))
            if ring_check_res(m, clean_frag):
                count += 1
        # Progress
        if verbose and processed % 10 == 0:
            print("\rProcessed %d" % processed, end="")
    print("\r", end="")
    return count / total


def check_ring_filter(linker):
    """Return False if *linker* has a ring containing a (non-aromatic) double bond.

    A ring fails when any bond of type DOUBLE has both of its atoms in that
    ring (rings are taken from ``Chem.GetSymmSSSR``).  Returns True otherwise.
    """
    for ring in Chem.GetSymmSSSR(linker):
        ring_atoms = set(ring)
        for atom_idx in ring_atoms:
            for bond in linker.GetAtomWithIdx(atom_idx).GetBonds():
                if (bond.GetBondType() == Chem.BondType.DOUBLE
                        and bond.GetBeginAtomIdx() in ring_atoms
                        and bond.GetEndAtomIdx() in ring_atoms):
                    return False
    return True


def check_pains(mol, pains_smarts):
    """Return True if *mol* matches none of the patterns in *pains_smarts*."""
    return not any(mol.HasSubstructMatch(pain) for pain in pains_smarts)


def calc_2d_filters(toks, pains_smarts):
    """Evaluate the 2D filters for one generated molecule.

    *toks* is ``(full molecule SMILES, linker SMILES, unlinked fragments
    SMILES)``.  Returns ``[sa_improved, ring_ok, pains_ok]`` when the cleaned
    fragments are a substructure of the full molecule, ``[]`` when they are
    not, and ``[False, False, False]`` if anything raises (e.g. a SMILES that
    cannot be parsed).
    """
    try:
        frags = Chem.MolFromSmiles(toks[2])
        linker = Chem.MolFromSmiles(toks[1])
        full_mol = Chem.MolFromSmiles(toks[0])
        # Remove dummy atoms from unlinked fragments
        clean_frag = _strip_dummy_atoms(frags)

        res = []
        # Check: Unlinked fragments in full molecule
        # NOTE(review): the three checks below are taken to be nested under
        # this condition — confirm against the upstream file.
        if len(full_mol.GetSubstructMatch(clean_frag)) > 0:
            res = [
                # SA score improved from unlinked fragments to full molecule
                calc_sa_score_mol(full_mol) < calc_sa_score_mol(frags),
                # No non-aromatic rings with double bonds
                check_ring_filter(linker),
                # Pass pains filters
                check_pains(full_mol, pains_smarts),
            ]
        return res
    except Exception:
        # Best effort: a molecule that cannot be processed fails every filter.
        return [False, False, False]


def calc_filters_2d_dataset(results, pains_smarts_loc, n_cores=1):
    """Run ``calc_2d_filters`` over a whole dataset in parallel.

    *pains_smarts_loc* is the path of a CSV file whose first column holds
    PAINS SMARTS patterns.  For every entry ``toks`` of *results* the filters
    are computed on ``[toks[2], toks[4], toks[1]]``.  Returns the list of
    per-entry filter results, in input order.
    """
    # Load pains filters (empty CSV rows are skipped)
    with open(pains_smarts_loc, 'r') as f:
        pains_smarts = [
            Chem.MolFromSmarts(row[0], mergeHs=True)
            for row in csv.reader(f)
            if row
        ]
    with Parallel(n_jobs=n_cores, backend='multiprocessing') as parallel:
        filters_2d = parallel(
            delayed(calc_2d_filters)([toks[2], toks[4], toks[1]], pains_smarts)
            for toks in results
        )
    return filters_2d


def calc_mol_props(smiles):
    """Return ``[QED, SA score]`` for *smiles*, or None if it cannot be parsed."""
    # Create RDKit mol
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print("Error passing: %s" % smiles)
        return None

    # Use the explicitly imported QED module: ``Chem.QED`` is only available
    # as an attribute if some other module happened to import it first.
    qed = QED.qed(mol)
    # Synthetic accessibility score
    sas = sascorer.calculateScore(mol)
    return [qed, sas]


def calc_sa_score_mol(mol, verbose=False):
    """Return the synthetic accessibility score of *mol* (None if *mol* is None)."""
    if mol is None:
        if verbose:
            print("Error passing: %s" % mol)
        return None
    # Synthetic accessibility score
    return sascorer.calculateScore(mol)


def get_linker(full_mol, clean_frag, starting_point):
    """Extract the linker of *full_mol* as a SMILES string.

    INPUT FORMAT: molecule (RDKit mol object), clean fragments (RDKit mol
    object), starting fragments (SMILES).

    Returns the linker SMILES, or ``""`` when the fragments do not match the
    molecule, when there are no linker atoms, or when no candidate is found.
    When several candidates are found the first one is returned.

    Note: *full_mol* is kekulized in place (aromatic flags cleared).
    """
    # Get matches of fragments
    matches = list(full_mol.GetSubstructMatches(clean_frag))

    # If no matches, terminate
    if len(matches) == 0:
        print("No matches")
        return ""

    # Get number of atoms in linker
    linker_len = full_mol.GetNumHeavyAtoms() - clean_frag.GetNumHeavyAtoms()
    if linker_len == 0:
        return ""

    # Setup: keep a copy taken before kekulization for the fragmentation step
    mol_to_break = Chem.Mol(full_mol)
    Chem.Kekulize(full_mol, clearAromaticFlags=True)

    poss_linker = []

    # Loop over matches
    for match in matches:
        mol_rw = Chem.RWMol(full_mol)
        # Get linker atoms
        linker_atoms = list(set(list(range(full_mol.GetNumHeavyAtoms()))).difference(match))
        linker_bonds = []
        atoms_joined_to_linker = []
        # Loop over starting fragments atoms
        # Get (i) bonds between starting fragments and linker, (ii) atoms joined to linker
        for idx_to_delete in sorted(match, reverse=True):
            nei = [x.GetIdx() for x in mol_rw.GetAtomWithIdx(idx_to_delete).GetNeighbors()]
            intersect = set(nei).intersection(set(linker_atoms))
            if len(intersect) == 1:
                linker_bonds.append(mol_rw.GetBondBetweenAtoms(idx_to_delete, list(intersect)[0]).GetIdx())
                atoms_joined_to_linker.append(idx_to_delete)
            elif len(intersect) > 1:
                for idx_nei in list(intersect):
                    linker_bonds.append(mol_rw.GetBondBetweenAtoms(idx_to_delete, idx_nei).GetIdx())
                    atoms_joined_to_linker.append(idx_to_delete)

        # Check number of atoms joined to linker
        # If not == 2, check next match
        if len(set(atoms_joined_to_linker)) != 2:
            continue

        # Delete starting fragments atoms (highest index first so that the
        # remaining indices stay valid)
        for idx_to_delete in sorted(match, reverse=True):
            mol_rw.RemoveAtom(idx_to_delete)

        linker = Chem.Mol(mol_rw)
        # Check linker required num atoms
        if linker.GetNumHeavyAtoms() == linker_len:
            mol_rw = Chem.RWMol(full_mol)
            # Delete linker atoms
            for idx_to_delete in sorted(linker_atoms, reverse=True):
                mol_rw.RemoveAtom(idx_to_delete)
            frags = Chem.Mol(mol_rw)
            # Check there are two disconnected fragments
            if len(Chem.rdmolops.GetMolFrags(frags)) == 2:
                # Fragment molecule into starting fragments and linker
                fragmented_mol = Chem.FragmentOnBonds(mol_to_break, linker_bonds)
                # Remove starting fragments from fragmentation
                linker_to_return = Chem.Mol(fragmented_mol)
                qp = Chem.AdjustQueryParameters()
                qp.makeDummiesQueries = True
                for f in starting_point.split('.'):
                    qfrag = Chem.AdjustQueryProperties(Chem.MolFromSmiles(f), qp)
                    linker_to_return = AllChem.DeleteSubstructs(linker_to_return, qfrag, onlyFrags=True)

                # Check linker is connected and two bonds to outside molecule
                if len(Chem.rdmolops.GetMolFrags(linker)) == 1 and len(linker_bonds) == 2:
                    Chem.Kekulize(linker_to_return, clearAromaticFlags=True)
                    # If for some reason a starting fragment isn't removed
                    # (and it's larger than the linker), remove it
                    # (happens very occasionally)
                    if len(Chem.rdmolops.GetMolFrags(linker_to_return)) > 1:
                        for frag in Chem.MolToSmiles(linker_to_return).split('.'):
                            if Chem.MolFromSmiles(frag).GetNumHeavyAtoms() == linker_len:
                                return frag
                    return Chem.MolToSmiles(Chem.MolFromSmiles(Chem.MolToSmiles(linker_to_return)))

                # If not, add to possible linkers (above doesn't capture some complex cases)
                else:
                    fragmented_mol = Chem.MolFromSmiles(Chem.MolToSmiles(fragmented_mol), sanitize=False)
                    linker_to_return = AllChem.DeleteSubstructs(fragmented_mol, Chem.MolFromSmiles(starting_point))
                    poss_linker.append(Chem.MolToSmiles(linker_to_return))

    # If only one possibility, return linker
    if len(poss_linker) == 1:
        return poss_linker[0]
    # If no possibilities, process failed
    elif len(poss_linker) == 0:
        print("FAIL:", Chem.MolToSmiles(full_mol), Chem.MolToSmiles(clean_frag), starting_point)
        return ""
    # If multiple possibilities, process probably failed
    else:
        print("More than one poss linker. ", poss_linker)
        return poss_linker[0]


def get_linker_v2(full_mol, clean_frag):
    """Extract the linker of *full_mol* as a SMILES string.

    INPUT FORMAT: molecule (RDKit mol object), clean fragments (RDKit mol
    object).

    Unlike ``get_linker`` the linker is obtained by deleting the matched
    fragment atoms from the molecule, so no starting-fragment SMILES is
    needed.

    Returns the linker SMILES, or ``""`` when the fragments do not match the
    molecule, when there are no linker atoms, or when no candidate is found.
    When several candidates are found the first one is returned.

    Note: *full_mol* is kekulized in place (aromatic flags cleared).
    """
    # Get matches of fragments
    matches = list(full_mol.GetSubstructMatches(clean_frag))

    # If no matches, terminate
    if len(matches) == 0:
        print("No matches")
        return ""

    # Get number of atoms in linker
    linker_len = full_mol.GetNumHeavyAtoms() - clean_frag.GetNumHeavyAtoms()
    if linker_len == 0:
        return ""

    # Setup
    Chem.Kekulize(full_mol, clearAromaticFlags=True)

    poss_linker = []

    # Loop over matches
    for match in matches:
        mol_rw = Chem.RWMol(full_mol)
        # Get linker atoms
        linker_atoms = list(set(list(range(full_mol.GetNumHeavyAtoms()))).difference(match))
        linker_bonds = []
        atoms_joined_to_linker = []
        # Loop over starting fragments atoms
        # Get (i) bonds between starting fragments and linker, (ii) atoms joined to linker
        for idx_to_delete in sorted(match, reverse=True):
            nei = [x.GetIdx() for x in mol_rw.GetAtomWithIdx(idx_to_delete).GetNeighbors()]
            intersect = set(nei).intersection(set(linker_atoms))
            if len(intersect) == 1:
                linker_bonds.append(mol_rw.GetBondBetweenAtoms(idx_to_delete, list(intersect)[0]).GetIdx())
                atoms_joined_to_linker.append(idx_to_delete)
            elif len(intersect) > 1:
                for idx_nei in list(intersect):
                    linker_bonds.append(mol_rw.GetBondBetweenAtoms(idx_to_delete, idx_nei).GetIdx())
                    atoms_joined_to_linker.append(idx_to_delete)

        # Check number of atoms joined to linker
        # If not == 2, check next match
        if len(set(atoms_joined_to_linker)) != 2:
            continue

        # Delete starting fragments atoms (highest index first so that the
        # remaining indices stay valid)
        for idx_to_delete in sorted(match, reverse=True):
            mol_rw.RemoveAtom(idx_to_delete)

        linker = Chem.Mol(mol_rw)
        # Check linker required num atoms
        if linker.GetNumHeavyAtoms() == linker_len:
            # Check linker is connected and two bonds to outside molecule
            if len(Chem.rdmolops.GetMolFrags(linker)) == 1 and len(linker_bonds) == 2:
                Chem.Kekulize(linker, clearAromaticFlags=True)
                # If for some reason a starting fragment isn't removed
                # (and it's larger than the linker), remove it
                # (happens very occasionally)
                if len(Chem.rdmolops.GetMolFrags(linker)) > 1:
                    for frag in Chem.MolToSmiles(linker).split('.'):
                        if Chem.MolFromSmiles(frag).GetNumHeavyAtoms() == linker_len:
                            return frag
                return Chem.MolToSmiles(Chem.MolFromSmiles(Chem.MolToSmiles(linker)))

            # If not, add to possible linkers (above doesn't capture some complex cases)
            else:
                poss_linker.append(Chem.MolToSmiles(linker))

    # If only one possibility, return linker
    if len(poss_linker) == 1:
        return poss_linker[0]
    # If no possibilities, process failed
    elif len(poss_linker) == 0:
        print("FAIL:", Chem.MolToSmiles(full_mol), Chem.MolToSmiles(clean_frag))
        return ""
    # If multiple possibilities, process probably failed
    else:
        print("More than one poss linker. ", poss_linker)
        return poss_linker[0]


def unique(results):
    """Return the fraction of entries that are not duplicates within their group.

    *results* is an iterable of sized collections of hashable items.
    Duplicates are counted per collection, not across collections.

    Raises ``ZeroDivisionError`` if *results* contains no items at all.
    """
    total_dupes = 0
    total = 0
    for res in results:
        original_num = len(res)
        new_num = len(set(res))
        total_dupes += original_num - new_num
        total += original_num
    return 1 - total_dupes / float(total)


def check_recovered_original_mol_with_idx(results):
    """Check, per group, whether the original molecule was regenerated.

    For every group ``res`` in *results* the reference SMILES is
    ``res[0][0][0]``; each entry ``m`` of the group supplies a generated
    SMILES ``m[0][2]`` and a value ``m[1]``.  Molecules are compared as
    canonical SMILES after removing stereochemistry and hydrogens.
    NOTE: tautomers are not canonicalised, so different tautomers compare as
    different molecules.

    Returns ``(outcomes, rec_idx)``: *outcomes* has one bool per group, and
    *rec_idx* is the flat list of ``m[1]`` for every matching entry.
    """
    outcomes = []
    rec_idx = []
    for res in results:
        success = False
        # Load original mol and canonicalise
        orig_mol = Chem.MolFromSmiles(res[0][0][0])
        Chem.RemoveStereochemistry(orig_mol)
        orig_mol = Chem.MolToSmiles(Chem.RemoveHs(orig_mol))
        # Check generated mols
        for m in res:
            gen_mol = Chem.MolFromSmiles(m[0][2])
            Chem.RemoveStereochemistry(gen_mol)
            gen_mol = Chem.MolToSmiles(Chem.RemoveHs(gen_mol))
            if gen_mol == orig_mol:
                # Keep scanning: every matching entry contributes to rec_idx
                success = True
                rec_idx.append(m[1])
        outcomes.append(success)
    return outcomes, rec_idx


def topology_from_rdkit(rdkit_molecule):
    """Build an undirected networkx graph from an RDKit molecule.

    Nodes are atom indices with an ``atom_type`` attribute (atomic number);
    edges are bonds with a ``bond_type`` attribute (RDKit bond type).
    """
    topology = nx.Graph()
    # Add the atoms as nodes
    for atom in rdkit_molecule.GetAtoms():
        topology.add_node(atom.GetIdx(), atom_type=atom.GetAtomicNum())
    # Add the bonds as edges
    for bond in rdkit_molecule.GetBonds():
        topology.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), bond_type=bond.GetBondType())
    return topology