Spaces:

QCDevs
/

Selector_GSoC

Sleeping

App Files Files Community

Selector_GSoC / streamlit_app /pages /MaxMin.py

legend1234

Initial draft (#252)

bbcd4a0 unverified 11 months ago

raw

history blame contribute delete

4.35 kB

	# The Selector library provides a set of tools for selecting a
	# subset of the dataset and computing diversity.
	#
	# Copyright (C) 2023 The QC-Devs Community
	#
	# This file is part of Selector.
	#
	# Selector is free software; you can redistribute it and/or
	# modify it under the terms of the GNU General Public License
	# as published by the Free Software Foundation; either version 3
	# of the License, or (at your option) any later version.
	#
	# Selector is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program; if not, see <http://www.gnu.org/licenses/>
	#
	# --

	import streamlit as st
	import os
	import sys

	from sklearn.metrics import pairwise_distances
	from selector.methods.distance import MaxMin

	# Make the streamlit_app directory (one level above this file's directory)
	# importable, so that modules living there can be imported by this page.
	current_dir = os.path.dirname(os.path.abspath(__file__))
	parent_dir = os.path.join(current_dir, "..")
	# This script may be executed many times within one Python process (Streamlit
	# reruns page scripts on interaction), so only append the directory when it is
	# not already present; otherwise sys.path grows by one duplicate per rerun.
	if parent_dir not in sys.path:
	    sys.path.append(parent_dir)

	from utils import *

	# Page metadata: title plus the QC-Devs logo shipped in the assets folder.
	icon_path = os.path.join(parent_dir, "assets", "QC-Devs.png")
	st.set_page_config(page_title="MaxMin", page_icon=icon_path)

	# Main heading shown at the top of the page body.
	st.title("Brute Strength - MaxMin")

	# Explanatory text about the algorithm, handed to display_sidebar_info below.
	description = """
	MaxMin is possibly the most widely used method for dissimilarity-based
	compound selection. When presented with a dataset of samples, the
	initial point is chosen as the dataset's medoid center. Next, the second
	point is chosen to be that which is furthest from this initial point.
	Subsequently, all following points are selected via the following
	logic:

	1. Find the minimum distance from every point to the already-selected ones.
	2. Select the point which has the maximum distance among those calculated
	in the previous step.

	In the current implementation, this method requires or computes the full pairwise-distance
	matrix, so it is not recommended for large datasets.
	"""

	# Literature citation for the method, assembled from adjacent string literals.
	references = (
	    "[1] Ashton, Mark, et al., Identification of diverse database subsets using "
	    "property‐based and fragment‐based molecular descriptions, "
	    "Quantitative Structure‐Activity Relationships 21.6 (2002): 598-604."
	)

	# Hand the page title, description and citation to the shared sidebar helper.
	display_sidebar_info("Brute Strength - MaxMin", description, references)

	# Required input: a feature matrix or a distance matrix in one of the accepted
	# formats. The clear_results callback is invoked whenever the upload changes.
	matrix_file = st.file_uploader(
	    "Upload a feature matrix or distance matrix (required)",
	    type=["csv", "xlsx", "npz", "npy"],
	    key="matrix_file",
	    on_change=clear_results,
	)

	# The selection controls are only shown once a matrix has been uploaded.
	if matrix_file is not None:
	    # Load data from matrix file
	    matrix = load_matrix(matrix_file)

	    # Every widget below calls clear_results when its value changes.
	    num_points = st.number_input(
	        "Number of points to select",
	        min_value=1,
	        step=1,
	        key="num_points",
	        on_change=clear_results,
	    )
	    label_file = st.file_uploader(
	        "Upload a cluster label list (optional)",
	        type=["csv", "xlsx"],
	        key="label_file",
	        on_change=clear_results,
	    )
	    if label_file:
	        labels = load_labels(label_file)
	    else:
	        labels = None

	    distance_metric = st.selectbox(
	        "Select distance metric (optional)",
	        [None, "euclidean", "manhattan", "cosine"],
	        key="distance_metric",
	        on_change=clear_results,
	    )

	    # Build a distance callable only when a metric was picked; with no metric
	    # MaxMin is constructed with its defaults (presumably it then expects the
	    # upload to already be a distance matrix -- confirm against selector docs).
	    if distance_metric:
	        def fun_dist(x):
	            """Return the pairwise distances of ``x`` under the chosen metric."""
	            return pairwise_distances(x, metric=distance_metric)
	    else:
	        fun_dist = None

	    if st.button("Run MaxMin Algorithm"):
	        selector = MaxMin(fun_dist) if fun_dist else MaxMin()
	        selected_ids = run_algorithm(selector, matrix, num_points, labels)

	        # Keep the selector and its result in the session state so they
	        # remain available after this button press.
	        st.session_state['selector'] = selector
	        st.session_state['selected_ids'] = selected_ids
	else:
	    # No matrix is currently uploaded: drop any previously stored selection.
	    st.session_state.pop("selected_ids", None)


	# Display the stored selection (if any) while a matrix is still uploaded,
	# and pass it to the export helper.
	if matrix_file is not None and 'selected_ids' in st.session_state:
	    selected_ids = st.session_state['selected_ids']
	    st.write("Selected indices:", selected_ids)

	    export_results(selected_ids)