legend1234's picture
Initial draft (#252)
bbcd4a0 unverified
# The Selector library provides a set of tools for selecting a
# subset of the dataset and computing diversity.
#
# Copyright (C) 2023 The QC-Devs Community
#
# This file is part of Selector.
#
# Selector is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# Selector is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>
#
# --
import streamlit as st
import os
import sys
from sklearn.metrics import pairwise_distances
from selector.methods.distance import MaxMin
# Add the streamlit_app directory to the Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.join(current_dir, "..")
sys.path.append(parent_dir)
from utils import *
# Page setup: browser-tab title and icon, main heading, and the sidebar text.
page_heading = "Brute Strength - MaxMin"
icon_path = os.path.join(parent_dir, "assets", "QC-Devs.png")

st.set_page_config(page_title="MaxMin", page_icon=icon_path)
st.title(page_heading)

# Algorithm summary passed to display_sidebar_info below.
description = """
MaxMin is possibly the most widely used method for dissimilarity-based
compound selection. When presented with a dataset of samples, the
initial point is chosen as the dataset's medoid center. Next, the second
point is chosen to be that which is furthest from this initial point.
Subsequently, all following points are selected via the following
logic:
1. Find the minimum distance from every point to the already-selected ones.
2. Select the point which has the maximum distance among those calculated
in the previous step.
In the current implementation, this method requires or computes the full pairwise-distance
matrix, so it is not recommended for large datasets.
"""

# Literature reference passed to display_sidebar_info below.
references = (
    "[1] Ashton, Mark, et al., Identification of diverse database subsets using "
    "property‐based and fragment‐based molecular descriptions, "
    "Quantitative Structure‐Activity Relationships 21.6 (2002): 598-604."
)

display_sidebar_info(page_heading, description, references)
# Required input: a feature matrix or a distance matrix.
# Every widget below registers clear_results as its on_change callback.
matrix_file = st.file_uploader(
    "Upload a feature matrix or distance matrix (required)",
    type=["csv", "xlsx", "npz", "npy"],
    key="matrix_file",
    on_change=clear_results,
)

if matrix_file is not None:
    # A matrix was uploaded: load it and show the remaining controls.
    matrix = load_matrix(matrix_file)

    num_points = st.number_input(
        "Number of points to select",
        min_value=1,
        step=1,
        key="num_points",
        on_change=clear_results,
    )

    # Optional cluster labels; stays None when no label file is uploaded.
    label_file = st.file_uploader(
        "Upload a cluster label list (optional)",
        type=["csv", "xlsx"],
        key="label_file",
        on_change=clear_results,
    )
    labels = None
    if label_file:
        labels = load_labels(label_file)

    # Optional distance metric; None means no distance function is supplied.
    distance_metric = st.selectbox(
        "Select distance metric (optional)",
        [None, "euclidean", "manhattan", "cosine"],
        key="distance_metric",
        on_change=clear_results,
    )
    if distance_metric:

        def fun_dist(x):
            """Return the pairwise distances of ``x`` under the chosen metric."""
            return pairwise_distances(x, metric=distance_metric)

    else:
        fun_dist = None

    if st.button("Run MaxMin Algorithm"):
        # Pass the distance function only when a metric was selected.
        selector = MaxMin(fun_dist) if fun_dist else MaxMin()
        selected_ids = run_algorithm(selector, matrix, num_points, labels)

        # Store the selector and its result in the session state.
        st.session_state["selector"] = selector
        st.session_state["selected_ids"] = selected_ids
else:
    # No matrix file is present: discard any stored selection.
    st.session_state.pop("selected_ids", None)
# Display and export the stored selection, but only while a matrix file
# is uploaded and the session state holds selected indices.
has_results = matrix_file is not None and "selected_ids" in st.session_state
if has_results:
    selected_ids = st.session_state["selected_ids"]
    st.write("Selected indices:", selected_ids)
    export_results(selected_ids)