"""Submission-related utilities."""
import os
import json
import logging
import numpy as np
import pandas as pd
from io import StringIO
from typing import Optional
from sklearn.preprocessing import StandardScaler
from configuration import (
    GENE_EXPRESSION_DATA,
    GENE_EXPRESSION_METADATA,
    GENES,
    GENE_STANDARDIZATION_PARAMETERS,
)
from cos import RESULTS_PREFIX, string_to_key
from forward import predict

# from attention import upload_attention

logger = logging.getLogger("openapi_server:submission")


def submission(
    drug: dict,
    workspace_id: str,
    task_id: str,
    estimate_confidence: bool = False,
    omics_file: Optional[str] = None,
) -> dict:
    """
    Submit PaccMann prediction

    The drug is scored against gene expression profiles: either the default
    ones imported from ``configuration`` or the ones parsed from
    ``omics_file``. Uploaded profiles are aligned to ``GENES`` (genes missing
    from the upload are filled with 0.0) and standardized before prediction.

    Args:
        drug (dict): drug to analyse in dictionary format. Only the
            "smiles" entry is read here.
        workspace_id (str): workspace identifier for the submission.
        task_id (str): task identifier.
        estimate_confidence (bool, optional): estimate confidence of the
            prediction. Defaults to False.
        omics_file (Optional[str], optional): path to file containing
            expression data, read with ``pandas.read_csv``. Defaults to None,
            in which case the default expression data is used.

    Returns:
        dict: the outputs of ``predict`` converted with ``.numpy()``, keyed
            by output name. "IC50" and "log_micromolar_IC50" are always
            read from it; "epistemic_confidence" and "aleatoric_confidence"
            are read when ``estimate_confidence`` is True. Ordering
            corresponds to the rows of the expression metadata.
    """
    # NOTE: currently only referenced by the disabled attention upload below.
    prefix = os.path.join(RESULTS_PREFIX, workspace_id, task_id)
    logger.debug("processing omic data.")
    # NOTE: this trick is used in case a single example is passed
    single_example = False
    if omics_file is None:
        gene_expression = GENE_EXPRESSION_DATA
        # Work on a copy: prediction columns are added to the metadata below
        # and must not accumulate on the shared module-level object.
        gene_expression_metadata = GENE_EXPRESSION_METADATA.copy()
    else:
        logger.debug("parsing uploaded omic data.")
        logger.debug(omics_file)
        gene_expression_df = pd.read_csv(omics_file, low_memory=False)
        logger.debug(gene_expression_df.columns)
        # Columns that are known genes are expression values; everything
        # else in the upload is treated as sample metadata.
        to_drop = list(set(GENES) & set(gene_expression_df.columns))
        # Align columns to GENES (same order), zero-filling absent genes.
        gene_expression_data = gene_expression_df.reindex(columns=GENES).fillna(0.0)
        gene_expression_metadata = gene_expression_df.drop(to_drop, axis=1)
        logger.debug("peek parsed expression and metadata.")
        logger.debug("gene_expression_data:\n{}".format(gene_expression_data.head()))
        logger.debug(
            "gene_expression_metadata:\n{}".format(gene_expression_metadata.head())
        )
        if gene_expression_data.shape[0] < 2:
            logger.debug(
                "single example, standardizing with default parameters:\n{}".format(
                    GENE_STANDARDIZATION_PARAMETERS
                )
            )
            single_example = True
            # Statistics cannot be estimated from one sample, so the
            # precomputed parameters are used: (x - [0]) / [1].
            gene_expression = (
                gene_expression_data.values - GENE_STANDARDIZATION_PARAMETERS[0]
            ) / GENE_STANDARDIZATION_PARAMETERS[1]
            # Duplicate the sample so a batch of two is passed to predict;
            # the duplicate is trimmed from the outputs afterwards.
            # NOTE(review): presumably predict cannot handle a batch of
            # one - confirm against forward.predict.
            gene_expression = np.vstack(2 * [gene_expression])
            logger.debug(gene_expression.shape)
        else:
            # Several samples: standardize each gene over the uploaded data.
            gene_expression = StandardScaler().fit_transform(
                gene_expression_data.values
            )
        logger.debug("gene_expression:\n{}".format(gene_expression[:10]))
    logger.debug("omic data prepared if present.")
    prediction_dict = predict(
        smiles=drug["smiles"],
        gene_expression=gene_expression,
        estimate_confidence=estimate_confidence,
    )
    # from tensors; keep only the first row when the input was duplicated
    prediction_dict = {
        key: value.numpy()[:1] if single_example else value.numpy()
        for key, value in prediction_dict.items()
    }
    result = dict(prediction_dict)
    # merge for single table, index is unique identifier for samples.
    gene_expression_metadata["IC50 (min/max scaled)"] = prediction_dict["IC50"]
    gene_expression_metadata["IC50 (log(μmol))"] = prediction_dict[
        "log_micromolar_IC50"
    ]
    if estimate_confidence:
        gene_expression_metadata["epistemic_confidence"] = prediction_dict[
            "epistemic_confidence"
        ]
        gene_expression_metadata["aleatoric_confidence"] = prediction_dict[
            "aleatoric_confidence"
        ]
    logger.debug("uploaded predicted sensitivity table including metadata.")
    # attention
    # result.update(
    #     upload_attention(
    #         prefix,
    #         sample_names=list(map(str, gene_expression_metadata.index)),
    #         omic_attention=prediction_dict["gene_attention"],
    #         smiles_attention=prediction_dict["smiles_attention"],
    #     )
    # )
    logger.debug("uploaded attention for each sample.")
    logger.debug("uploading drug information and sensitivity.")
    # prediction (is sensitivity_json in API)
    logger.debug("uploaded drug information and sensitivity.")
    # NOTE: Ordering corresponds to IDs in GEP metadata!
    return result