# paccmann / submission.py
# jannisborn's picture
# update
# ed32193 unverified
# raw
# history blame
# 4.63 kB
"""Submission-related utilities."""
import os
import json
import logging
import numpy as np
import pandas as pd
from io import StringIO
from typing import Optional
from sklearn.preprocessing import StandardScaler
from configuration import (
GENE_EXPRESSION_DATA,
GENE_EXPRESSION_METADATA,
GENES,
GENE_STANDARDIZATION_PARAMETERS,
)
from cos import RESULTS_PREFIX, string_to_key
from forward import predict
# from attention import upload_attention
# Module-level logger used by this file; the logger name is a fixed string
# rather than one derived from `__name__`.
logger = logging.getLogger("openapi_server:submission")
def submission(
    drug: dict,
    workspace_id: str,
    task_id: str,
    estimate_confidence: bool = False,
    omics_file: Optional[str] = None,
) -> dict:
    """
    Submit PaccMann prediction.

    Args:
        drug (dict): drug to analyse in dictionary format. Only the
            "smiles" entry is read here.
        workspace_id (str): workspace identifier for the submission.
            Currently unused: it was only needed for the results prefix of
            the disabled attention upload.
        task_id (str): task identifier. Currently unused for the same
            reason as `workspace_id`.
        estimate_confidence (bool, optional): estimate confidence of the
            prediction. Defaults to False.
        omics_file (Optional[str], optional): path to file containing
            expression data. Defaults to None, in which case the default
            GENE_EXPRESSION_DATA / GENE_EXPRESSION_METADATA are used.

    Returns:
        dict: every entry returned by `predict`, converted with `.numpy()`.
            When a single uploaded sample was duplicated to form a batch,
            only the first row of each entry is kept. The ordering follows
            the rows of the gene expression metadata.
    """
    logger.debug("processing omic data.")
    # NOTE: this trick is used in case a single example is passed
    single_example = False
    result = {}
    if omics_file is None:
        gene_expression = GENE_EXPRESSION_DATA
        # Work on a copy: prediction columns are added to the metadata
        # below, and the module-level default table must not accumulate
        # (possibly stale) columns across calls.
        gene_expression_metadata = GENE_EXPRESSION_METADATA.copy()
    else:
        logger.debug("parsing uploaded omic data.")
        logger.debug(omics_file)
        gene_expression_df = pd.read_csv(omics_file, low_memory=False)
        logger.debug(gene_expression_df.columns)
        # Columns that are known genes are expression values; everything
        # else in the upload is treated as sample metadata.
        to_drop = list(set(GENES) & set(gene_expression_df.columns))
        # Align the columns to GENES (in that order), filling genes that
        # are missing from the upload with 0.0.
        gene_expression_data = gene_expression_df.T.reindex(GENES).fillna(0.0).T
        gene_expression_metadata = gene_expression_df.drop(to_drop, axis=1)
        logger.debug("peek parsed expression and metadata.")
        logger.debug("gene_expression_data:\n%s", gene_expression_data.head())
        logger.debug(
            "gene_expression_metadata:\n%s", gene_expression_metadata.head()
        )
        if gene_expression_data.shape[0] < 2:
            # Per-gene statistics cannot be fitted on fewer than two
            # samples, so fall back to the precomputed parameters
            # (index 0 is subtracted, index 1 is the divisor).
            logger.debug(
                "single example, standardizing with default parameters:\n%s",
                GENE_STANDARDIZATION_PARAMETERS,
            )
            single_example = True
            gene_expression = (
                gene_expression_data.values - GENE_STANDARDIZATION_PARAMETERS[0]
            ) / GENE_STANDARDIZATION_PARAMETERS[1]
            # Duplicate the sample to obtain a batch of two rows; the
            # duplicate prediction is discarded after `predict`.
            gene_expression = np.vstack(2 * [gene_expression])
            logger.debug(gene_expression.shape)
        else:
            gene_expression = StandardScaler().fit_transform(
                gene_expression_data.values
            )
        logger.debug("gene_expression:\n%s", gene_expression[:10])
    logger.debug("omic data prepared if present.")
    prediction_dict = predict(
        smiles=drug["smiles"],
        gene_expression=gene_expression,
        estimate_confidence=estimate_confidence,
    )
    # from tensors; build a new dict instead of rewriting the one that is
    # being iterated, and drop the duplicated row for single examples.
    prediction_dict = {
        key: (value.numpy()[:1] if single_example else value.numpy())
        for key, value in prediction_dict.items()
    }
    result.update(prediction_dict)
    # merge for single table, index is unique identifier for samples.
    # NOTE(review): this table is neither returned nor uploaded in this
    # function; the assignments are kept so that a length mismatch between
    # predictions and metadata still surfaces here.
    gene_expression_metadata["IC50 (min/max scaled)"] = prediction_dict["IC50"]
    gene_expression_metadata["IC50 (log(μmol))"] = prediction_dict[
        "log_micromolar_IC50"
    ]
    if estimate_confidence:
        gene_expression_metadata["epistemic_confidence"] = prediction_dict[
            "epistemic_confidence"
        ]
        gene_expression_metadata["aleatoric_confidence"] = prediction_dict[
            "aleatoric_confidence"
        ]
    logger.debug("uploaded predicted sensitivity table including metadata.")
    # NOTE: the attention upload is disabled (the `upload_attention` import
    # at module level is commented out). Re-enabling it requires the
    # results prefix os.path.join(RESULTS_PREFIX, workspace_id, task_id),
    # the sample names from the metadata index, and the "gene_attention" /
    # "smiles_attention" entries of the prediction.
    logger.debug("uploaded attention for each sample.")
    logger.debug("uploading drug information and sensitivity.")
    # prediction (is sensitivity_json in API)
    logger.debug("uploaded drug information and sensitivity.")
    # NOTE: Ordering corresponds to IDs in GEP metadata!
    return result