# Selector_GSoC/streamlit_app/pages/FeatureMatrixConversion.py
# Last commit: "Cleaning up (#258)" by legend1234 (ab07025, unverified)
import streamlit as st
from rdkit import Chem
from rdkit.Chem import Draw
import tempfile
import os
import sys
# Extend the import search path so the imports below resolve when this page
# is run as a standalone script.
# current_dir is the directory containing this file.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Two levels above this file; appended so that the `streamlit_app` package
# (imported below as `streamlit_app.features`) can be found.
module_path = os.path.abspath(os.path.join(current_dir, '..', '..'))
sys.path.append(module_path)
# One level above this file; presumably where the `utils` module imported
# below lives (verify). Also reused later to locate the page icon asset.
parent_dir = os.path.join(current_dir, "..")
sys.path.append(parent_dir)
from utils import *
from streamlit_app.features import DescriptorGenerator, FingerprintGenerator
# Page configuration (title and icon), followed by the on-page heading.
icon_path = os.path.join(parent_dir, "assets", "QC-Devs.png")
set_page_config(page_title="Chem Converter", page_icon=icon_path)
st.title("Chemical File Converter")

# Short introduction rendered below the heading.
page_intro = """
This page allows you to upload raw chemical file formats such as SMILES or SDF,
and convert them into chemical matrices that can be used as input for selector's various algorithms.
"""
st.markdown(page_intro)

# Upload widget for the raw chemical file, restricted via `type` to the
# listed extensions.
accepted_extensions = ["txt", "smi", "sdf"]
chemical_file = st.file_uploader(
    "Upload a chemical file (e.g., SMILES, SDF, or TXT)",
    type=accepted_extensions,
)
def _read_smiles(raw_bytes):
    """Parse the uploaded bytes as one SMILES string per line.

    Parameters
    ----------
    raw_bytes : bytes
        Full contents of the uploaded file.

    Returns
    -------
    list
        One entry per non-blank line: the object returned by
        ``Chem.MolFromSmiles`` for that line (``None`` when parsing failed).
    """
    # "utf-8-sig" drops a leading byte-order mark so it does not end up glued
    # to the first SMILES string; undecodable bytes are replaced instead of
    # raising, so only the affected lines fail to parse.
    text = raw_bytes.decode("utf-8-sig", errors="replace")
    # Skip blank lines (e.g. a trailing newline) so they are not handed to
    # RDKit as empty SMILES strings.
    stripped_lines = (line.strip() for line in text.splitlines())
    return [Chem.MolFromSmiles(smiles) for smiles in stripped_lines if smiles]


def _read_sdf(raw_bytes):
    """Parse the uploaded bytes as an SDF file via a temporary file.

    ``Chem.SDMolSupplier`` is given a file path, so the content is written to
    a temporary ``.sdf`` file that is always removed before returning, even if
    parsing raises.

    Parameters
    ----------
    raw_bytes : bytes
        Full contents of the uploaded file.

    Returns
    -------
    list
        One entry per record yielded by the supplier; unreadable records are
        ``None``.
    """
    temp_sdf_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".sdf") as temp_sdf:
            # Record the path before writing so a failed write is still cleaned up.
            temp_sdf_path = temp_sdf.name
            temp_sdf.write(raw_bytes)
        supplier = Chem.SDMolSupplier(temp_sdf_path)
        records = list(supplier)
        # Drop the supplier before deleting the file so any handle it holds
        # on the file is released first.
        del supplier
        return records
    finally:
        if temp_sdf_path and os.path.exists(temp_sdf_path):
            try:
                os.remove(temp_sdf_path)
            except OSError:
                # Best-effort cleanup: a failed delete must not mask the
                # original error or break the page.
                pass


if chemical_file:
    # User selects the file format; the empty first option means "not chosen yet".
    file_format = st.selectbox(
        "Select the format of the provided file",
        options=["", "SMILES", "SDF"],
    )

    if file_format:
        # getvalue() returns the whole upload regardless of the current stream
        # position, so the result does not depend on an earlier read().
        raw_bytes = chemical_file.getvalue()

        # Process the chemical file based on user selection
        molecules = []
        if file_format == "SMILES":
            molecules = _read_smiles(raw_bytes)
        elif file_format == "SDF":
            molecules = _read_sdf(raw_bytes)

        # Keep only the entries RDKit could parse and tell the user about the rest.
        valid_molecules = [mol for mol in molecules if mol is not None]
        skipped = len(molecules) - len(valid_molecules)

        if not valid_molecules:
            st.error("No valid molecules found in the uploaded file.")
        else:
            st.success(f"Successfully loaded {len(valid_molecules)} valid molecules.")
            if skipped:
                st.warning(f"Skipped {skipped} entries that could not be parsed.")

            # Display the molecules
            img = Draw.MolsToImage(valid_molecules)
            st.image(img, caption="Molecules in the file")

            # Choose the type of matrix to generate
            matrix_type = st.selectbox("Choose matrix type", ["Descriptors", "Fingerprints"])

            if matrix_type == "Descriptors":
                # Allow the user to choose the type of descriptors to generate
                use_fragment = st.checkbox(
                    "Whether return value includes the fragment binary descriptors",
                    value=True,
                )
                ipc_avg = st.checkbox("Whether IPC descriptor calculates with avg", value=True)

                descriptor_generator = DescriptorGenerator(valid_molecules)
                matrix = descriptor_generator.rdkit_desc(use_fragment, ipc_avg)

            elif matrix_type == "Fingerprints":
                # Allow user to choose the type of fingerprint to generate
                fp_type = st.selectbox(
                    "Select Fingerprint Type",
                    options=["SECFP", "ECFP", "Morgan"],
                )
                n_bits = st.number_input(
                    "Number of bits for the fingerprint", min_value=1, value=2048
                )
                radius = st.number_input(
                    "The maximum radius of the substructure that is generated at each atom",
                    min_value=1,
                    value=3,
                )
                min_radius = st.number_input(
                    "The minimum radius that is used to extract n-grams",
                    min_value=1,
                    value=3,
                )
                random_seed = st.number_input(
                    "Random seed for fingerprint generation", min_value=0, value=12345
                )
                rings = st.checkbox(
                    "Whether the rings (SSSR) are extracted from the molecule and added to the shingling",
                    value=True,
                )
                isomeric = st.checkbox(
                    "Whether the SMILES added to the shingling are isomeric", value=True
                )
                kekulize = st.checkbox(
                    "Whether the SMILES added to the shingling are kekulized", value=False
                )

                fp_generator = FingerprintGenerator(valid_molecules)
                # Forward every option collected above; previously only
                # fp_type was passed, so the other widgets had no effect.
                # NOTE(review): assumes compute_fingerprint accepts these
                # keyword names — confirm against streamlit_app/features.py.
                matrix = fp_generator.compute_fingerprint(
                    fp_type=fp_type,
                    n_bits=n_bits,
                    radius=radius,
                    min_radius=min_radius,
                    random_seed=random_seed,
                    rings=rings,
                    isomeric=isomeric,
                    kekulize=kekulize,
                )

            st.write("Generated Chemical Matrix:")
            st.dataframe(matrix)

            # Option to download the matrix as CSV
            csv_data = matrix.to_csv().encode('utf-8')
            st.download_button(
                "Download Chemical Matrix as CSV",
                data=csv_data,
                file_name="chemical_matrix.csv",
                mime="text/csv",
            )