File size: 6,659 Bytes
6d990bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import logging
import csv
import re
import platform
import io
from subprocess import Popen, PIPE
from openbabel import openbabel
import pdb
from rdkit.Chem import rdchem

if platform.system() == 'Windows':
    CXCALC_BIN = 'C:\\Users\\vuu10\\AppData\\Local\\Programs\\ChemAxon\\MarvinSuite\\bin\\cxcalc.exe'
    #CXCALC_BIN = 'C:\\Program Files (x86)\\ChemAxon\\MarvinBeans\\bin\\cxcalc.bat'
    use_shell_for_echo = True
else:
    CXCALC_BIN = 'cxcalc'
    use_shell_for_echo = False

MID_PH = 7.0
N_PKAS = 20


class ChemAxonError(Exception):
    pass


def RunCxcalc(molstring, args):
    # pdb.set_trace()
    #     with open(platform.DEV_NULL, 'w') as dev_null:
    try:
        logging.debug("INPUT: echo %s | %s" %
                      (molstring, ' '.join([CXCALC_BIN] + args)))
        p1 = Popen(["echo", molstring], stdout=PIPE,
                   shell=use_shell_for_echo)
# 		p2 = Popen([CXCALC_BIN] + args, stdin=p1.stdout,
# 				   executable=CXCALC_BIN, stdout=PIPE, stderr=dev_null, shell=False)
        p2 = Popen([CXCALC_BIN] + args, stdin=p1.stdout,
                   executable=CXCALC_BIN, stdout=PIPE, shell=False)
        # p.wait()
        # os.remove(temp_fname)
        res = p2.communicate()[0]
        if p2.returncode != 0:
            raise ChemAxonError(str(args))
        logging.debug("OUTPUT: %s" % res)
        res = res.decode('utf-8')
        return res
    except OSError:
        raise Exception(
            "Marvin (by ChemAxon) must be installed to calculate pKa data.")


def ParsePkaOutput(s, n_acidic, n_basic):
    """
        Returns:
            A dictionary that maps the atom index to a list of pKas
            that are assigned to that atom.
    """
#     s = s.decode('utf-8')
    atom2pKa = {}

    pkaline = s.split('\n')[1]
    splitline = pkaline.split('\t')
    splitline.pop(0)

    if n_acidic + n_basic > 0:
        if len(splitline) != (n_acidic + n_basic + 2):
            raise ChemAxonError('ChemAxon failed to find any pKas')

        pKa_list = []
        acid_or_base_list = []
        for i in range(n_acidic + n_basic):
            x = splitline.pop(0)
            if x == '':
                continue

            pKa_list.append(float(x))
            if i < n_acidic:
                acid_or_base_list.append('acid')
            else:
                acid_or_base_list.append('base')

        atom_list = splitline.pop(0)

        if atom_list:  # a comma separated list of the deprotonated atoms
            atom_numbers = [int(y)-1 for y in atom_list.split(',')]
            for i, j in enumerate(atom_numbers):
                atom2pKa.setdefault(j, [])
                atom2pKa[j].append((pKa_list[i], acid_or_base_list[i]))

    smiles_list = splitline
    return atom2pKa, smiles_list


def GetDissociationConstants_val(molstring, n_acidic=N_PKAS, n_basic=N_PKAS,
                                 pH=MID_PH):
    """
        Returns:
            A pair of (pKa list, major pseudoisomer)

            - the pKa list is of the pKa values in ascending order.
            - the major pseudoisomer is a SMILES string of the major species
              at the given pH.
    """
    args = []
    if n_acidic + n_basic > 0:
        args += ['pka', '-a', str(n_acidic), '-b', str(n_basic),
                 'majorms', '-M', 'true', '--pH', str(pH)]

    output = RunCxcalc(molstring, args)
    atom2pKa, smiles_list = ParsePkaOutput(output, n_acidic, n_basic)

    all_pKas = []
    for pKa_list in list(atom2pKa.values()):
        all_pKas += [pKa for pKa, _ in pKa_list]

    return sorted(all_pKas), smiles_list


def GetDissociationConstants(molstring, n_acidic=N_PKAS, n_basic=N_PKAS,
                             pH=MID_PH):
    """
        Arguments:
            molstring - a text description of the molecule (SMILES or InChI)
            n_acidic  - the max no. of acidic pKas to calculate
            n_basic   - the max no. of basic pKas to calculate
            pH        - the pH for which the major pseudoisomer is calculated

        Returns a pair:
            (all_pKas, major_ms)

        - all_pKas is a list of floats (pKa values)
        - major_ms is a SMILES string of the major pseudoisomer at pH_mid
    """
    all_pKas, smiles_list = GetDissociationConstants_val(molstring, n_acidic,
                                                         n_basic, pH)
    major_ms = smiles_list[0]
    return all_pKas, major_ms


def GetFormulaAndCharge(molstring):
    """
        Arguments:
            molstring - a text description of the molecule (SMILES or InChI)

        Returns:
            chemical formula of the molecule
    """
    args = ['formula', 'formalcharge']
    output = RunCxcalc(molstring, args)
    # the output is a tab separated table whose columns are:
    # id, Formula, Formal charge
    f = io.StringIO(output)
    tsv_output = csv.reader(f, delimiter='\t')
    headers = next(tsv_output)
    if headers != ['id', 'Formula', 'Formal charge']:
        raise ChemAxonError(
            'cannot get the formula and charge for: ' + molstring)
    _, formula, formal_charge = next(tsv_output)

    try:
        formal_charge = int(formal_charge)
    except ValueError:
        formal_charge = 0

    return formula, formal_charge


def GetAtomBagAndCharge(molstring):
    formula, formal_charge = GetFormulaAndCharge(molstring)
    periodic_table = rdchem.GetPeriodicTable()

    atom_bag = {}
    for mol_formula_times in formula.split('.'):
        for times, mol_formula in re.findall('^(\d+)?(\w+)', mol_formula_times):
            if not times:
                times = 1
            else:
                times = int(times)
            for atom, count in re.findall("([A-Z][a-z]*)([0-9]*)", mol_formula):
                if count == '':
                    count = 1
                else:
                    count = int(count)
                atom_bag[atom] = atom_bag.get(atom, 0) + count * times

    n_protons = sum([c * periodic_table.GetAtomicNumber(str(elem))
                     for (elem, c) in atom_bag.items()])
    atom_bag['e-'] = n_protons - formal_charge

    return atom_bag, formal_charge


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.WARNING)
    from molecule import Molecule
    compound_list = [
        ('D-Erythrulose', 'InChI=1S/C4H8O4/c5-1-3(7)4(8)2-6/h3,5-7H,1-2H2/t3-/m1/s1')]

    for name, inchi in compound_list:
        print("Formula: %s\nCharge: %d" % GetFormulaAndCharge(inchi))
        diss_table, major_ms = GetDissociationConstants(inchi)
        m = Molecule.FromSmiles(major_ms)
        print("Name: %s\nInChI: %s\npKas: %s" %
              (name, m.ToInChI(), str(diss_table)))