dGPredictor / CC /chemaxon.py
vuu10's picture
Upload 6 files
6d990bb
raw
history blame
6.66 kB
import logging
import csv
import re
import platform
import io
from subprocess import Popen, PIPE
from openbabel import openbabel
import pdb
from rdkit.Chem import rdchem
if platform.system() == 'Windows':
CXCALC_BIN = 'C:\\Users\\vuu10\\AppData\\Local\\Programs\\ChemAxon\\MarvinSuite\\bin\\cxcalc.exe'
#CXCALC_BIN = 'C:\\Program Files (x86)\\ChemAxon\\MarvinBeans\\bin\\cxcalc.bat'
use_shell_for_echo = True
else:
CXCALC_BIN = 'cxcalc'
use_shell_for_echo = False
MID_PH = 7.0
N_PKAS = 20
class ChemAxonError(Exception):
pass
def RunCxcalc(molstring, args):
# pdb.set_trace()
# with open(platform.DEV_NULL, 'w') as dev_null:
try:
logging.debug("INPUT: echo %s | %s" %
(molstring, ' '.join([CXCALC_BIN] + args)))
p1 = Popen(["echo", molstring], stdout=PIPE,
shell=use_shell_for_echo)
# p2 = Popen([CXCALC_BIN] + args, stdin=p1.stdout,
# executable=CXCALC_BIN, stdout=PIPE, stderr=dev_null, shell=False)
p2 = Popen([CXCALC_BIN] + args, stdin=p1.stdout,
executable=CXCALC_BIN, stdout=PIPE, shell=False)
# p.wait()
# os.remove(temp_fname)
res = p2.communicate()[0]
if p2.returncode != 0:
raise ChemAxonError(str(args))
logging.debug("OUTPUT: %s" % res)
res = res.decode('utf-8')
return res
except OSError:
raise Exception(
"Marvin (by ChemAxon) must be installed to calculate pKa data.")
def ParsePkaOutput(s, n_acidic, n_basic):
"""
Returns:
A dictionary that maps the atom index to a list of pKas
that are assigned to that atom.
"""
# s = s.decode('utf-8')
atom2pKa = {}
pkaline = s.split('\n')[1]
splitline = pkaline.split('\t')
splitline.pop(0)
if n_acidic + n_basic > 0:
if len(splitline) != (n_acidic + n_basic + 2):
raise ChemAxonError('ChemAxon failed to find any pKas')
pKa_list = []
acid_or_base_list = []
for i in range(n_acidic + n_basic):
x = splitline.pop(0)
if x == '':
continue
pKa_list.append(float(x))
if i < n_acidic:
acid_or_base_list.append('acid')
else:
acid_or_base_list.append('base')
atom_list = splitline.pop(0)
if atom_list: # a comma separated list of the deprotonated atoms
atom_numbers = [int(y)-1 for y in atom_list.split(',')]
for i, j in enumerate(atom_numbers):
atom2pKa.setdefault(j, [])
atom2pKa[j].append((pKa_list[i], acid_or_base_list[i]))
smiles_list = splitline
return atom2pKa, smiles_list
def GetDissociationConstants_val(molstring, n_acidic=N_PKAS, n_basic=N_PKAS,
pH=MID_PH):
"""
Returns:
A pair of (pKa list, major pseudoisomer)
- the pKa list is of the pKa values in ascending order.
- the major pseudoisomer is a SMILES string of the major species
at the given pH.
"""
args = []
if n_acidic + n_basic > 0:
args += ['pka', '-a', str(n_acidic), '-b', str(n_basic),
'majorms', '-M', 'true', '--pH', str(pH)]
output = RunCxcalc(molstring, args)
atom2pKa, smiles_list = ParsePkaOutput(output, n_acidic, n_basic)
all_pKas = []
for pKa_list in list(atom2pKa.values()):
all_pKas += [pKa for pKa, _ in pKa_list]
return sorted(all_pKas), smiles_list
def GetDissociationConstants(molstring, n_acidic=N_PKAS, n_basic=N_PKAS,
pH=MID_PH):
"""
Arguments:
molstring - a text description of the molecule (SMILES or InChI)
n_acidic - the max no. of acidic pKas to calculate
n_basic - the max no. of basic pKas to calculate
pH - the pH for which the major pseudoisomer is calculated
Returns a pair:
(all_pKas, major_ms)
- all_pKas is a list of floats (pKa values)
- major_ms is a SMILES string of the major pseudoisomer at pH_mid
"""
all_pKas, smiles_list = GetDissociationConstants_val(molstring, n_acidic,
n_basic, pH)
major_ms = smiles_list[0]
return all_pKas, major_ms
def GetFormulaAndCharge(molstring):
"""
Arguments:
molstring - a text description of the molecule (SMILES or InChI)
Returns:
chemical formula of the molecule
"""
args = ['formula', 'formalcharge']
output = RunCxcalc(molstring, args)
# the output is a tab separated table whose columns are:
# id, Formula, Formal charge
f = io.StringIO(output)
tsv_output = csv.reader(f, delimiter='\t')
headers = next(tsv_output)
if headers != ['id', 'Formula', 'Formal charge']:
raise ChemAxonError(
'cannot get the formula and charge for: ' + molstring)
_, formula, formal_charge = next(tsv_output)
try:
formal_charge = int(formal_charge)
except ValueError:
formal_charge = 0
return formula, formal_charge
def GetAtomBagAndCharge(molstring):
formula, formal_charge = GetFormulaAndCharge(molstring)
periodic_table = rdchem.GetPeriodicTable()
atom_bag = {}
for mol_formula_times in formula.split('.'):
for times, mol_formula in re.findall('^(\d+)?(\w+)', mol_formula_times):
if not times:
times = 1
else:
times = int(times)
for atom, count in re.findall("([A-Z][a-z]*)([0-9]*)", mol_formula):
if count == '':
count = 1
else:
count = int(count)
atom_bag[atom] = atom_bag.get(atom, 0) + count * times
n_protons = sum([c * periodic_table.GetAtomicNumber(str(elem))
for (elem, c) in atom_bag.items()])
atom_bag['e-'] = n_protons - formal_charge
return atom_bag, formal_charge
if __name__ == "__main__":
logging.getLogger().setLevel(logging.WARNING)
from molecule import Molecule
compound_list = [
('D-Erythrulose', 'InChI=1S/C4H8O4/c5-1-3(7)4(8)2-6/h3,5-7H,1-2H2/t3-/m1/s1')]
for name, inchi in compound_list:
print("Formula: %s\nCharge: %d" % GetFormulaAndCharge(inchi))
diss_table, major_ms = GetDissociationConstants(inchi)
m = Molecule.FromSmiles(major_ms)
print("Name: %s\nInChI: %s\npKas: %s" %
(name, m.ToInChI(), str(diss_table)))