B3clf / b3clf /descriptor_padel.py

Synced repo using 'sync_with_huggingface' Github Action

d05f89f 9 months ago

No virus

2.8 kB

	# -*- coding: utf-8 -*-
	# The B3clf library computes the blood-brain barrier (BBB) permeability
	# of organic molecules with resampling strategies.
	#
	# Copyright (C) 2021 The Ayers Lab
	#
	# This file is part of B3clf.
	#
	# B3clf is free software; you can redistribute it and/or
	# modify it under the terms of the GNU General Public License
	# as published by the Free Software Foundation; either version 3
	# of the License, or (at your option) any later version.
	#
	# B3clf is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program; if not, see <http://www.gnu.org/licenses/>
	#
	# --

	import os
	import sys

	# Append "<directory of this file>/padelpy" to the module search path so the
	# `from padelpy import from_sdf` import that follows can be resolved from
	# there (presumably a copy of padelpy shipped alongside this module -- confirm).
	# The directory is appended, not inserted at the front, so any entry already
	# on sys.path is searched first.
	cwd = os.path.dirname(os.path.abspath(__file__))
	sys.path.append(os.path.join(cwd, "padelpy"))

	import pandas as pd
	from rdkit import Chem
	from padelpy import from_sdf

	"""Compute PaDEL descriptors."""


	def compute_descriptors(sdf_file,
	                        excel_out="padel_descriptors.xlsx",
	                        output_csv=None,
	                        timeout=None,
	                        time_per_molecule=-1,
	                        ) -> pd.DataFrame:
	    """Compute the chemical descriptors with PaDEL.

	    The descriptors are calculated by ``padelpy.from_sdf`` and the resulting
	    table is indexed by the molecule names read from the same SDF file with
	    RDKit. Rows that contain any missing value are dropped.

	    Parameters
	    ----------
	    sdf_file : str
	        Input SDF file name.
	    excel_out : str or None, optional
	        Excel file name to save PaDEL descriptors. When set to None, no Excel
	        file is written.
	    output_csv : str or None, optional
	        Passed unchanged to ``from_sdf`` as ``output_csv``.
	        NOTE(review): presumably the CSV file PaDEL writes its raw results to,
	        with None meaning a temporary file -- confirm against padelpy.
	    timeout : float or None, optional
	        The maximum time, in seconds, for calculating the descriptors. When set
	        to be None, this does not take effect.
	    time_per_molecule : int, optional
	        Passed unchanged to ``from_sdf`` as ``maxruntime``.
	        NOTE(review): presumably the per-molecule run time limit, with -1
	        meaning unlimited; the unit should be confirmed against padelpy.

	    Returns
	    -------
	    df_desc : pandas.DataFrame
	        The computed PaDEL descriptors, one row per molecule, indexed by
	        molecule name (index name ``"ID"``), without rows containing NaN.

	    Raises
	    ------
	    ValueError
	        If RDKit cannot parse a record of ``sdf_file``, or if the number of
	        molecule names does not match the number of descriptor rows.

	    """
	    descriptor_rows = from_sdf(
	        sdf_file=sdf_file,
	        output_csv=output_csv,
	        descriptors=True,
	        fingerprints=False,
	        timeout=timeout,
	        maxruntime=time_per_molecule,
	    )
	    df_desc = pd.DataFrame(descriptor_rows)

	    # add molecule names to dataframe
	    suppl = Chem.SDMolSupplier(
	        sdf_file,
	        sanitize=True,
	        removeHs=False,
	        strictParsing=True,
	    )
	    mol_names = []
	    for position, mol in enumerate(suppl):
	        # The supplier yields None for a record it fails to parse or sanitize;
	        # report which record it was instead of failing on None.GetProp.
	        if mol is None:
	            raise ValueError(
	                "RDKit could not parse molecule record {} (0-based) in {!r}; "
	                "cannot assign molecule names to the PaDEL descriptors."
	                .format(position, sdf_file)
	            )
	        mol_names.append(mol.GetProp("_Name"))

	    # The names are matched to the descriptor rows purely by position, so both
	    # sources must describe the same number of molecules.
	    if len(mol_names) != len(df_desc):
	        raise ValueError(
	            "Number of molecules read from {!r} ({}) does not match the number "
	            "of PaDEL descriptor rows ({})."
	            .format(sdf_file, len(mol_names), len(df_desc))
	        )
	    df_desc.index = mol_names
	    df_desc.index.name = "ID"

	    # drop rows with nan values
	    # todo: add imputation option
	    df_desc.dropna(axis=0, inplace=True)

	    # save results
	    if excel_out is not None:
	        df_desc.to_excel(excel_out, engine="openpyxl")

	    return df_desc

	# Index will be the molecule's name