"""Configuration utils.""" import os import json import dill import pandas as pd from pytoda.transforms import Compose from pytoda.smiles.transforms import SMILESToTokenIndexes, LeftPadding, Canonicalization from cos import ensure_filepath_from_uri, COS_BUCKET_URI # model files MODEL_WEIGHTS_URI = ensure_filepath_from_uri(os.path.join(COS_BUCKET_URI, "model.pt")) MODEL_PARAMS_URI = ensure_filepath_from_uri(os.path.join(COS_BUCKET_URI, "model.json")) # SMILES language file SMILES_LANGUAGE_URI = ensure_filepath_from_uri( os.path.join(COS_BUCKET_URI, "smiles_language.pkl") ) # gene expression file GENE_EXPRESSION_URI = ensure_filepath_from_uri( os.path.join(COS_BUCKET_URI, "gene_expression.csv.zip") ) # genes file GENES_URI = ensure_filepath_from_uri(os.path.join(COS_BUCKET_URI, "genes.pkl")) # genes standardization parameters GENE_EXPRESSION_STANDARDIZATION_URI = ensure_filepath_from_uri( os.path.join(COS_BUCKET_URI, "gene_expression_standardization.pkl") ) # load the model with open(MODEL_PARAMS_URI) as fp: MODEL_PARAMS = json.load(fp) MAX_LENGTH = MODEL_PARAMS["smiles_padding_length"] # load SMILES language with open(SMILES_LANGUAGE_URI, "rb") as fp: SMILES_LANGUAGE = dill.load(fp) # load gene expression GENE_EXPRESSION = pd.read_csv(GENE_EXPRESSION_URI, compression="zip", low_memory=False) # load genes with open(GENES_URI, "rb") as fp: GENES = dill.load(fp) # load gene standardization parameters with open(GENE_EXPRESSION_STANDARDIZATION_URI, "rb") as fp: GENE_STANDARDIZATION_PARAMETERS = dill.load(fp) # smiles transformations SMILES_TRANSFORMS = [ Canonicalization(), SMILESToTokenIndexes(smiles_language=SMILES_LANGUAGE), LeftPadding(padding_length=MAX_LENGTH, padding_index=SMILES_LANGUAGE.padding_index), ] SMILES_TOKENIZE_FN = Compose(SMILES_TRANSFORMS) # prepare default gene expression data # NOTE: transpose and reset work around to ensure we have all needed genes GENE_EXPRESSION_DATA = GENE_EXPRESSION.T.reindex(GENES).fillna(0.0).T.values # NOTE: sub-selecting exisiting columns to remove all the genes to_drop = list(set(GENES) & set(GENE_EXPRESSION.columns)) GENE_EXPRESSION_METADATA = GENE_EXPRESSION.drop(to_drop, axis=1) del GENE_EXPRESSION # housekeeping RESULTS_EXPIRATION_SECONDS = float( os.environ.get( "PACCMANN_RESULTS_EXPIRATION_SECONDS", # every week 60 * 60 * 24 * 7, ) ) # SMILES parameters # TODO: think whether we should enforce canonicalization CANON = { "canonical": MODEL_PARAMS["canonical"], "kekulize": MODEL_PARAMS["kekulize"], "all_bonds_explicit": MODEL_PARAMS["all_bonds_explicit"], "all_hs_explicit": MODEL_PARAMS["all_hs_explicit"], "randomize": MODEL_PARAMS["randomize"], "remove_bonddir": MODEL_PARAMS["remove_bonddir"], "smiles_maximum_length": MODEL_PARAMS["smiles_padding_length"], }