File size: 2,797 Bytes
d05f89f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# -*- coding: utf-8 -*-
# The B3clf library computes the blood-brain barrier (BBB) permeability
# of organic molecules with resampling strategies.
#
# Copyright (C) 2021 The Ayers Lab
#
# This file is part of B3clf.
#
# B3clf is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# B3clf is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>
#
# --

import os
import sys

# Make the "padelpy" directory that sits next to this file importable.
# NOTE: despite its name, `cwd` is the directory containing this file, not the
# process's current working directory. The path is appended, so it is searched
# after every entry already present in sys.path.
cwd = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(cwd, "padelpy"))

import pandas as pd
from rdkit import Chem
from padelpy import from_sdf

"""Compute PaDEL descriptors."""


def compute_descriptors(sdf_file,
                        excel_out="padel_descriptors.xlsx",
                        output_csv=None,
                        timeout=None,
                        time_per_molecule=-1,
                        ) -> pd.DataFrame:
    """Compute the chemical descriptors with PaDEL.

    The descriptors are computed with ``padelpy.from_sdf`` and the resulting
    rows are labelled with the molecule names (the ``_Name`` property) that
    RDKit reads from the same SDF file. Rows containing any NaN value are
    dropped before the table is saved and returned.

    Parameters
    ----------
    sdf_file : str
        Input SDF file name.
    excel_out : str, optional
        Excel file name to save PaDEL descriptors. When set to be None, no
        Excel file is written.
    output_csv : str, optional
        Forwarded unchanged to ``from_sdf`` as its ``output_csv`` argument.
    timeout : float, optional
        The maximum time, in seconds, for calculating the descriptors. When set to be None,
        this does not take effect.
    time_per_molecule : int, optional
        Forwarded to ``from_sdf`` as its ``maxruntime`` argument.
        NOTE(review): presumably the per-molecule run-time limit with -1
        meaning "no limit" -- confirm units against the PaDEL documentation.

    Returns
    -------
    df_desc : pandas.dataframe
        The computed pandas dataframe of PaDEL descriptors. The index is named
        "ID" and holds the molecule names.

    Raises
    ------
    ValueError
        If RDKit cannot parse a record of `sdf_file`, or if the number of
        molecules read by RDKit differs from the number of descriptor rows
        returned by PaDEL (the names could not be aligned with the rows).

    """
    desc = from_sdf(sdf_file=sdf_file,
                    output_csv=output_csv,
                    descriptors=True,
                    fingerprints=False,
                    timeout=timeout,
                    maxruntime=time_per_molecule,
                    )
    df_desc = pd.DataFrame(desc)

    # add molecule names to dataframe
    suppl = Chem.SDMolSupplier(sdf_file,
                               sanitize=True,
                               removeHs=False,
                               strictParsing=True)
    mol_names = []
    for position, mol in enumerate(suppl):
        # The supplier yields None for records it fails to parse; report which
        # record is broken instead of failing on `None.GetProp`.
        if mol is None:
            raise ValueError(
                f"RDKit could not parse molecule number {position + 1} "
                f"in SDF file {sdf_file!r}."
            )
        mol_names.append(mol.GetProp("_Name"))

    # Names are matched to descriptor rows purely by position, so the two
    # sequences must have the same length.
    if len(mol_names) != len(df_desc):
        raise ValueError(
            f"Number of molecules read from {sdf_file!r} ({len(mol_names)}) does "
            f"not match the number of PaDEL descriptor rows ({len(df_desc)})."
        )
    df_desc.index = mol_names
    df_desc.index.name = "ID"

    # drop rows with nan values
    # todo: add imputation option
    df_desc = df_desc.dropna(axis=0)

    # save results
    if excel_out is not None:
        df_desc.to_excel(excel_out, engine="openpyxl")

    return df_desc