import logging import csv import re import platform import io from subprocess import Popen, PIPE from openbabel import openbabel import pdb from rdkit.Chem import rdchem if platform.system() == 'Windows': CXCALC_BIN = 'C:\\Users\\vuu10\\AppData\\Local\\Programs\\ChemAxon\\MarvinSuite\\bin\\cxcalc.exe' #CXCALC_BIN = 'C:\\Program Files (x86)\\ChemAxon\\MarvinBeans\\bin\\cxcalc.bat' use_shell_for_echo = True else: CXCALC_BIN = 'cxcalc' use_shell_for_echo = False MID_PH = 7.0 N_PKAS = 20 class ChemAxonError(Exception): pass def RunCxcalc(molstring, args): # pdb.set_trace() # with open(platform.DEV_NULL, 'w') as dev_null: try: logging.debug("INPUT: echo %s | %s" % (molstring, ' '.join([CXCALC_BIN] + args))) p1 = Popen(["echo", molstring], stdout=PIPE, shell=use_shell_for_echo) # p2 = Popen([CXCALC_BIN] + args, stdin=p1.stdout, # executable=CXCALC_BIN, stdout=PIPE, stderr=dev_null, shell=False) p2 = Popen([CXCALC_BIN] + args, stdin=p1.stdout, executable=CXCALC_BIN, stdout=PIPE, shell=False) # p.wait() # os.remove(temp_fname) res = p2.communicate()[0] if p2.returncode != 0: raise ChemAxonError(str(args)) logging.debug("OUTPUT: %s" % res) res = res.decode('utf-8') return res except OSError: raise Exception( "Marvin (by ChemAxon) must be installed to calculate pKa data.") def ParsePkaOutput(s, n_acidic, n_basic): """ Returns: A dictionary that maps the atom index to a list of pKas that are assigned to that atom. """ # s = s.decode('utf-8') atom2pKa = {} pkaline = s.split('\n')[1] splitline = pkaline.split('\t') splitline.pop(0) if n_acidic + n_basic > 0: if len(splitline) != (n_acidic + n_basic + 2): raise ChemAxonError('ChemAxon failed to find any pKas') pKa_list = [] acid_or_base_list = [] for i in range(n_acidic + n_basic): x = splitline.pop(0) if x == '': continue pKa_list.append(float(x)) if i < n_acidic: acid_or_base_list.append('acid') else: acid_or_base_list.append('base') atom_list = splitline.pop(0) if atom_list: # a comma separated list of the deprotonated atoms atom_numbers = [int(y)-1 for y in atom_list.split(',')] for i, j in enumerate(atom_numbers): atom2pKa.setdefault(j, []) atom2pKa[j].append((pKa_list[i], acid_or_base_list[i])) smiles_list = splitline return atom2pKa, smiles_list def GetDissociationConstants_val(molstring, n_acidic=N_PKAS, n_basic=N_PKAS, pH=MID_PH): """ Returns: A pair of (pKa list, major pseudoisomer) - the pKa list is of the pKa values in ascending order. - the major pseudoisomer is a SMILES string of the major species at the given pH. """ args = [] if n_acidic + n_basic > 0: args += ['pka', '-a', str(n_acidic), '-b', str(n_basic), 'majorms', '-M', 'true', '--pH', str(pH)] output = RunCxcalc(molstring, args) atom2pKa, smiles_list = ParsePkaOutput(output, n_acidic, n_basic) all_pKas = [] for pKa_list in list(atom2pKa.values()): all_pKas += [pKa for pKa, _ in pKa_list] return sorted(all_pKas), smiles_list def GetDissociationConstants(molstring, n_acidic=N_PKAS, n_basic=N_PKAS, pH=MID_PH): """ Arguments: molstring - a text description of the molecule (SMILES or InChI) n_acidic - the max no. of acidic pKas to calculate n_basic - the max no. of basic pKas to calculate pH - the pH for which the major pseudoisomer is calculated Returns a pair: (all_pKas, major_ms) - all_pKas is a list of floats (pKa values) - major_ms is a SMILES string of the major pseudoisomer at pH_mid """ all_pKas, smiles_list = GetDissociationConstants_val(molstring, n_acidic, n_basic, pH) major_ms = smiles_list[0] return all_pKas, major_ms def GetFormulaAndCharge(molstring): """ Arguments: molstring - a text description of the molecule (SMILES or InChI) Returns: chemical formula of the molecule """ args = ['formula', 'formalcharge'] output = RunCxcalc(molstring, args) # the output is a tab separated table whose columns are: # id, Formula, Formal charge f = io.StringIO(output) tsv_output = csv.reader(f, delimiter='\t') headers = next(tsv_output) if headers != ['id', 'Formula', 'Formal charge']: raise ChemAxonError( 'cannot get the formula and charge for: ' + molstring) _, formula, formal_charge = next(tsv_output) try: formal_charge = int(formal_charge) except ValueError: formal_charge = 0 return formula, formal_charge def GetAtomBagAndCharge(molstring): formula, formal_charge = GetFormulaAndCharge(molstring) periodic_table = rdchem.GetPeriodicTable() atom_bag = {} for mol_formula_times in formula.split('.'): for times, mol_formula in re.findall('^(\d+)?(\w+)', mol_formula_times): if not times: times = 1 else: times = int(times) for atom, count in re.findall("([A-Z][a-z]*)([0-9]*)", mol_formula): if count == '': count = 1 else: count = int(count) atom_bag[atom] = atom_bag.get(atom, 0) + count * times n_protons = sum([c * periodic_table.GetAtomicNumber(str(elem)) for (elem, c) in atom_bag.items()]) atom_bag['e-'] = n_protons - formal_charge return atom_bag, formal_charge if __name__ == "__main__": logging.getLogger().setLevel(logging.WARNING) from molecule import Molecule compound_list = [ ('D-Erythrulose', 'InChI=1S/C4H8O4/c5-1-3(7)4(8)2-6/h3,5-7H,1-2H2/t3-/m1/s1')] for name, inchi in compound_list: print("Formula: %s\nCharge: %d" % GetFormulaAndCharge(inchi)) diss_table, major_ms = GetDissociationConstants(inchi) m = Molecule.FromSmiles(major_ms) print("Name: %s\nInChI: %s\npKas: %s" % (name, m.ToInChI(), str(diss_table)))