import logging
import os
import pathlib
import tempfile
from typing import List, Optional

import gradio as gr
import pandas as pd
from rdkit import Chem
from tqdm import tqdm

from configuration import GENE_EXPRESSION_METADATA
from submission import submission

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

# Replacement labels applied to the "site" column of the metadata table;
# sites that are not listed here are kept unchanged.
site_mapper = {
    "central_nervous_system": "CNS",
    "haematopoietic_and_lymphoid_tissue": "Haema_lymph",
    "upper_aerodigestive_tract": "digestive",
    "autonomic_ganglia": "ganglia",
}


def _uploaded_path(upload):
    """Return the filesystem path behind an uploaded file, or None.

    Accepts a plain path (``str`` / ``os.PathLike``) as well as a file-like
    object that exposes its path through a ``name`` attribute.
    """
    if upload is None:
        return None
    if isinstance(upload, (str, os.PathLike)):
        return os.fspath(upload)
    return upload.name


def run_inference(
    smiles: Optional[str],
    smiles_path: Optional[str],
    omic: Optional[str],
    confidence: bool,
):
    """Predict IC50 values for one or several molecules and export them.

    Args:
        smiles: A single SMILES string. ``None`` and ``""`` both mean
            "not provided".
        smiles_path: Uploaded tab-separated file without header whose first
            column holds one SMILES per row, or ``None``. May be a path or
            an object exposing the path as ``.name``.
        omic: Uploaded omics file forwarded to ``submission`` as
            ``omics_file``, or ``None`` to let ``submission`` run without
            one. May be a path or an object exposing the path as ``.name``.
        confidence: Whether to request confidence estimates and add them to
            the result.

    Returns:
        A tuple ``(csv_path, preview)``: the path of the CSV file holding
        the full result table and the first 25 rows of that table.

    Raises:
        TypeError: If neither or both of ``smiles`` and ``smiles_path`` are
            given.
        ValueError: If RDKit cannot parse one of the SMILES.
    """
    # Read SMILES: exactly one of the two sources has to be provided.
    has_text = bool(smiles)
    has_file = smiles_path is not None
    if not has_text and not has_file:
        raise TypeError("Pass either single SMILES or a file")
    if has_text and has_file:
        raise TypeError("Pass either single SMILES or a file, not both")

    if has_text:
        smiles_list = [smiles]
    else:
        smiles_data = pd.read_csv(
            _uploaded_path(smiles_path), sep="\t", header=None
        )
        smiles_list = smiles_data[0].tolist()

    # Validate everything up front so no prediction runs on a bad batch.
    for smi in smiles_list:
        if Chem.MolFromSmiles(smi) is None:
            raise ValueError(f"Found invalid SMILES {smi}")

    # Read omics and otherwise load baseline
    omic_path = _uploaded_path(omic)

    result = pd.DataFrame({})
    for smi in tqdm(smiles_list, total=len(smiles_list)):
        output = submission(
            drug={"smiles": smi},
            workspace_id="emulated_workspace_id",
            task_id="emulated_task_id",
            estimate_confidence=confidence,
            omics_file=omic_path,
        )
        # For the moment no attention analysis
        output.pop("gene_attention")
        output.pop("smiles_attention", None)
        output.pop("IC50")

        result[f"IC50_{smi}"] = output["log_micromolar_IC50"].squeeze().round(3)
        if confidence:
            result[f"aleatoric_confidence_{smi}"] = (
                output["aleatoric_confidence"].squeeze().round(3)
            )
            # The original code filled this column from the aleatoric values.
            # NOTE(review): assumes submission() returns an
            # "epistemic_confidence" entry next to "aleatoric_confidence"
            # -- confirm against submission.py.
            result[f"epistemic_confidence_{smi}"] = (
                output["epistemic_confidence"].squeeze().round(3)
            )
    predicted_df = result

    # Prepare DF to visualize
    if omic_path is None:
        # Baseline run: put the predictions next to the cell-line metadata.
        df = GENE_EXPRESSION_METADATA.copy()
        df.drop(
            [
                "histology",
                "cell_line_name",
                "IC50 (min/max scaled)",
                "IC50 (log(μmol))",
            ],
            axis=1,
            inplace=True,
        )
        df["site"] = df["site"].apply(lambda x: site_mapper.get(x, x))
        df["cell_line"] = df["cell_line"].apply(lambda x: x.split("_")[0])

        # Hide the confidence columns of the metadata table when the user
        # did not ask for confidence estimates.
        if (not confidence) and "aleatoric_confidence" in df.columns:
            df.drop(
                ["aleatoric_confidence", "epistemic_confidence"],
                axis=1,
                inplace=True,
            )

        result_df = pd.concat(
            [df["cell_line"], predicted_df, df.drop(["cell_line"], axis=1)],
            axis=1,
        )
    else:
        result_df = predicted_df

    # Save to temporary dir
    # NOTE(review): the file name is fixed, so concurrent requests write to
    # the same path.
    temp_path = os.path.join(tempfile.gettempdir(), "paccmann_result.csv")
    result_df.to_csv(temp_path)

    return temp_path, result_df.head(25)


if __name__ == "__main__":

    # Load metadata
    metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")

    examples = [
        ["COc1cc(O)c2c(c1)C=CCC(O)C(O)C(=O)C=CCC(C)OC2=O", None, None, False],
        ["COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OCCCN4CCOCC4", None, None, True],
        # [None, metadata_root.joinpath("molecules.smi"), None, False],
    ]

    with open(metadata_root.joinpath("article.md"), "r", encoding="utf-8") as f:
        article = f.read()

    with open(
        metadata_root.joinpath("description.md"), "r", encoding="utf-8"
    ) as f:
        description = f.read()

    demo = gr.Interface(
        fn=run_inference,
        title="PaccMann",
        inputs=[
            gr.Textbox(
                label="SMILES",
                placeholder="COc1cc(O)c2c(c1)C=CCC(O)C(O)C(=O)C=CCC(C)OC2=O",
                lines=1,
            ),
            gr.File(
                file_types=[".smi", ".tsv"],
                label="Multiple SMILES",
            ),
            gr.File(
                file_types=[".csv"],
                label="Transcriptomics data file",
            ),
            gr.Radio(choices=[True, False], label="Estimate confidence", value=False),
        ],
        outputs=[
            gr.File(label="Download full results"),
            gr.DataFrame(label="Preview of results for 25 cell lines"),
        ],
        article=article,
        description=description,
        examples=examples,
    )
    demo.launch(debug=True, show_error=True)