Christina Theodoris committed on
Commit 2f25aea
1 Parent(s): fd93ebf

Add functions for extracting gene embeddings, move state_embs_dict outside isp, fix bugs in isp

examples/in_silico_perturbation.ipynb CHANGED
@@ -8,21 +8,62 @@
    "outputs": [],
    "source": [
     "from geneformer import InSilicoPerturber\n",
-    "from geneformer import InSilicoPerturberStats"
+    "from geneformer import InSilicoPerturberStats\n",
+    "from geneformer import EmbExtractor"
    ]
   },
   {
+   "cell_type": "markdown",
+   "id": "cbd6851c-060e-4967-b816-e605ffe58b23",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### in silico perturbation in deletion mode to determine genes whose deletion in the dilated cardiomyopathy (dcm) state significantly shifts the embedding towards non-failing (nf) state"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c53e98cd-c603-4878-82ba-db471181bb55",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# first obtain start, goal, and alt embedding positions\n",
+    "# this function was changed to be separate from perturb_data\n",
+    "# to avoid repeating calculations when parallelizing perturb_data\n",
+    "cell_states_to_model={\"state_key\": \"disease\", \n",
+    "                      \"start_state\": \"dcm\", \n",
+    "                      \"goal_state\": \"nf\", \n",
+    "                      \"alt_states\": [\"hcm\"]}\n",
+    "\n",
+    "filter_data_dict={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]}\n",
+    "\n",
+    "embex = EmbExtractor(model_type=\"CellClassifier\",\n",
+    "                     num_classes=3,\n",
+    "                     filter_data=filter_data_dict,\n",
+    "                     max_ncells=1000,\n",
+    "                     emb_layer=0,\n",
+    "                     summary_stat=\"exact_mean\",\n",
+    "                     forward_batch_size=256,\n",
+    "                     nproc=16)\n",
+    "\n",
+    "state_embs_dict = embex.get_state_embs(cell_states_to_model,\n",
+    "                                       \"path/to/model\",\n",
+    "                                       \"path/to/input_data\",\n",
+    "                                       \"path/to/output_directory\",\n",
+    "                                       \"output_prefix\")"
+   ]
+  },
+  {
    "cell_type": "code",
    "execution_count": null,
-   "id": "67b44366-f255-4415-a865-6a27a8ffcce7",
+   "id": "981e1190-62da-4543-b7d3-6e2a2d6a6d56",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "# in silico perturbation in deletion mode to determine genes whose \n",
-    "# deletion in the dilated cardiomyopathy (dcm) state significantly shifts\n",
-    "# the embedding towards non-failing (nf) state\n",
     "isp = InSilicoPerturber(perturb_type=\"delete\",\n",
     "                        perturb_rank_shift=None,\n",
     "                        genes_to_perturb=\"all\",\n",
@@ -32,11 +73,9 @@
     "                        num_classes=3,\n",
     "                        emb_mode=\"cell\",\n",
     "                        cell_emb_style=\"mean_pool\",\n",
-    "                        filter_data={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]},\n",
-    "                        cell_states_to_model={'state_key': 'disease', \n",
-    "                                              'start_state': 'dcm', \n",
-    "                                              'goal_state': 'nf', \n",
-    "                                              'alt_states': ['hcm']},\n",
+    "                        filter_data=filter_data_dict,\n",
+    "                        cell_states_to_model=cell_states_to_model,\n",
+    "                        state_embs_dict=state_embs_dict,\n",
     "                        max_ncells=2000,\n",
     "                        emb_layer=0,\n",
     "                        forward_batch_size=400,\n",
@@ -68,7 +107,7 @@
     "                             genes_perturbed=\"all\",\n",
     "                             combos=0,\n",
     "                             anchor_gene=None,\n",
-    "                             cell_states_to_model={\"disease\":([\"dcm\"],[\"nf\"],[\"hcm\"])})"
+    "                             cell_states_to_model=cell_states_to_model)"
    ]
   },
   {
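Taken together, the notebook now demonstrates a two-step workflow: compute the state embedding positions once with `EmbExtractor.get_state_embs`, then hand the resulting `state_embs_dict` to `InSilicoPerturber`. A condensed sketch of the flow (paths are placeholders as in the notebook, and arguments are abbreviated to those visible in this diff; the perturber's `model_type` is assumed, since that line falls in an elided part of the cell):

```python
from geneformer import EmbExtractor, InSilicoPerturber

# states to shift from/towards, as defined in the notebook cell above
cell_states_to_model = {"state_key": "disease",
                        "start_state": "dcm",
                        "goal_state": "nf",
                        "alt_states": ["hcm"]}
filter_data_dict = {"cell_type": ["Cardiomyocyte1", "Cardiomyocyte2", "Cardiomyocyte3"]}

# step 1: compute exact mean embedding positions for each state once,
# outside the perturber, so parallel perturb_data runs can reuse them
embex = EmbExtractor(model_type="CellClassifier",
                     num_classes=3,
                     filter_data=filter_data_dict,
                     max_ncells=1000,
                     emb_layer=0,
                     summary_stat="exact_mean",
                     forward_batch_size=256,
                     nproc=16)
state_embs_dict = embex.get_state_embs(cell_states_to_model,
                                       "path/to/model",           # placeholder paths
                                       "path/to/input_data",
                                       "path/to/output_directory",
                                       "output_prefix")

# step 2: pass the precomputed positions into the perturber
# (InSilicoPerturberStats likewise takes cell_states_to_model, per the last hunk)
isp = InSilicoPerturber(perturb_type="delete",
                        genes_to_perturb="all",
                        model_type="CellClassifier",  # assumed to match the extractor
                        num_classes=3,
                        emb_mode="cell",
                        filter_data=filter_data_dict,
                        cell_states_to_model=cell_states_to_model,
                        state_embs_dict=state_embs_dict,
                        max_ncells=2000,
                        emb_layer=0,
                        forward_batch_size=400)
```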
geneformer/emb_extractor.py CHANGED
@@ -7,66 +7,62 @@ Usage:
                        num_classes=3,
                        emb_mode="cell",
                        cell_emb_style="mean_pool",
+                       gene_emb_style="mean_pool",
                        filter_data={"cell_type":["cardiomyocyte"]},
                        max_ncells=1000,
                        max_ncells_to_plot=1000,
                        emb_layer=-1,
                        emb_label=["disease","cell_type"],
                        labels_to_plot=["disease","cell_type"],
-                       forward_batch_size=100,
                        nproc=16,
                        summary_stat=None)
  embs = embex.extract_embs("path/to/model",
                            "path/to/input_data",
                            "path/to/output_directory",
                            "output_prefix")
-  embex.plot_embs(embs=embs,
+  embex.plot_embs(embs=embs,
                  plot_style="heatmap",
                  output_directory="path/to/output_directory",
                  output_prefix="output_prefix")
-
+
  """
 
 # imports
 import logging
+import pickle
+from collections import Counter
+from pathlib import Path
+
 import anndata
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import pickle
-from tdigest import TDigest
 import scanpy as sc
 import seaborn as sns
 import torch
-from collections import Counter
-from pathlib import Path
+from tdigest import TDigest
 from tqdm.auto import trange
-from transformers import BertForMaskedLM, BertForTokenClassification, BertForSequenceClassification
 
+from . import perturber_utils as pu
 from .tokenizer import TOKEN_DICTIONARY_FILE
 
-from .in_silico_perturber import downsample_and_sort, \
-    gen_attention_mask, \
-    get_model_input_size, \
-    load_and_filter, \
-    load_model, \
-    mean_nonpadding_embs, \
-    pad_tensor_list, \
-    quant_layers
-
 logger = logging.getLogger(__name__)
 
+
 # extract embeddings
-def get_embs(model,
-             filtered_input_data,
-             emb_mode,
-             layer_to_quant,
-             pad_token_id,
-             forward_batch_size,
-             summary_stat):
-    model_input_size = get_model_input_size(model)
+def get_embs(
+    model,
+    filtered_input_data,
+    emb_mode,
+    layer_to_quant,
+    pad_token_id,
+    forward_batch_size,
+    summary_stat=None,
+    silent=False,
+):
+    model_input_size = pu.get_model_input_size(model)
     total_batch_length = len(filtered_input_data)
-
+
     if summary_stat is None:
         embs_list = []
     elif summary_stat is not None:
@@ -74,69 +70,173 @@ def get_embs(model,
         example = filtered_input_data.select([i for i in range(1)])
         example.set_format(type="torch")
         emb_dims = test_emb(model, example["input_ids"], layer_to_quant)
-        # initiate tdigests for # of emb dims
-        embs_tdigests = [TDigest() for _ in range(emb_dims)]
+        if emb_mode == "cell":
+            # initiate tdigests for # of emb dims
+            embs_tdigests = [TDigest() for _ in range(emb_dims)]
+        if emb_mode == "gene":
+            gene_set = list(
+                {
+                    element
+                    for sublist in filtered_input_data["input_ids"]
+                    for element in sublist
+                }
+            )
+            # initiate dict with genes as keys and tdigests for # of emb dims as values
+            embs_tdigests_dict = {
+                k: [TDigest() for _ in range(emb_dims)] for k in gene_set
+            }
+
+    overall_max_len = 0
 
-    for i in trange(0, total_batch_length, forward_batch_size):
-        max_range = min(i+forward_batch_size, total_batch_length)
+    for i in trange(0, total_batch_length, forward_batch_size, leave=(not silent)):
+        max_range = min(i + forward_batch_size, total_batch_length)
 
         minibatch = filtered_input_data.select([i for i in range(i, max_range)])
-        max_len = max(minibatch["length"])
-        original_lens = torch.tensor(minibatch["length"]).to("cuda")
+
+        max_len = int(max(minibatch["length"]))
+        original_lens = torch.tensor(minibatch["length"], device="cuda")
         minibatch.set_format(type="torch")
 
         input_data_minibatch = minibatch["input_ids"]
-        input_data_minibatch = pad_tensor_list(input_data_minibatch,
-                                               max_len,
-                                               pad_token_id,
-                                               model_input_size)
-
+        input_data_minibatch = pu.pad_tensor_list(
+            input_data_minibatch, max_len, pad_token_id, model_input_size
+        )
+
         with torch.no_grad():
             outputs = model(
-                input_ids = input_data_minibatch.to("cuda"),
-                attention_mask = gen_attention_mask(minibatch)
+                input_ids=input_data_minibatch.to("cuda"),
+                attention_mask=pu.gen_attention_mask(minibatch),
             )
 
         embs_i = outputs.hidden_states[layer_to_quant]
-
+
         if emb_mode == "cell":
-            mean_embs = mean_nonpadding_embs(embs_i, original_lens)
+            mean_embs = pu.mean_nonpadding_embs(embs_i, original_lens)
             if summary_stat is None:
-                embs_list += [mean_embs]
+                embs_list.append(mean_embs)
             elif summary_stat is not None:
                 # update tdigests with current batch for each emb dim
-                # note: tdigest batch update known to be slow so updating serially
-                [embs_tdigests[j].update(mean_embs[i,j].item()) for i in range(mean_embs.size(0)) for j in range(emb_dims)]
-
+                accumulate_tdigests(embs_tdigests, mean_embs, emb_dims)
+            del mean_embs
+        elif emb_mode == "gene":
+            if summary_stat is None:
+                embs_list.append(embs_i)
+            elif summary_stat is not None:
+                for h in trange(len(minibatch)):
+                    length_h = minibatch[h]["length"]
+                    input_ids_h = minibatch[h]["input_ids"][0:length_h]
+
+                    # double check dimensions before unsqueezing
+                    embs_i_dim = embs_i.dim()
+                    if embs_i_dim != 3:
+                        logger.error(
+                            f"Embedding tensor should have 3 dimensions, not {embs_i_dim}"
+                        )
+                        raise
+
+                    embs_h = embs_i[h, :, :].unsqueeze(dim=1)
+                    dict_h = dict(zip(input_ids_h, embs_h))
+                    for k in dict_h.keys():
+                        accumulate_tdigests(
+                            embs_tdigests_dict[int(k)], dict_h[k], emb_dims
+                        )
+
+        overall_max_len = max(overall_max_len, max_len)
         del outputs
         del minibatch
         del input_data_minibatch
         del embs_i
-        del mean_embs
-        torch.cuda.empty_cache()
-
+
+        torch.cuda.empty_cache()
+
     if summary_stat is None:
-        embs_stack = torch.cat(embs_list)
+        if emb_mode == "cell":
+            embs_stack = torch.cat(embs_list, dim=0)
+        elif emb_mode == "gene":
+            embs_stack = pu.pad_tensor_list(
+                embs_list,
+                overall_max_len,
+                pad_token_id,
+                model_input_size,
+                1,
+                pu.pad_3d_tensor,
+            )
+
     # calculate summary stat embs from approximated tdigests
     elif summary_stat is not None:
-        if summary_stat == "mean":
-            summary_emb_list = [embs_tdigests[i].trimmed_mean(0,100) for i in range(emb_dims)]
-        elif summary_stat == "median":
-            summary_emb_list = [embs_tdigests[i].percentile(50) for i in range(emb_dims)]
-        embs_stack = torch.tensor(summary_emb_list)
+        if emb_mode == "cell":
+            if summary_stat == "mean":
+                summary_emb_list = tdigest_mean(embs_tdigests, emb_dims)
+            elif summary_stat == "median":
+                summary_emb_list = tdigest_median(embs_tdigests, emb_dims)
+            embs_stack = torch.tensor(summary_emb_list)
+        elif emb_mode == "gene":
+            if summary_stat == "mean":
+                [
+                    update_tdigest_dict_mean(embs_tdigests_dict, gene, emb_dims)
+                    for gene in embs_tdigests_dict.keys()
+                ]
+            elif summary_stat == "median":
+                [
+                    update_tdigest_dict_median(embs_tdigests_dict, gene, emb_dims)
+                    for gene in embs_tdigests_dict.keys()
+                ]
+            return embs_tdigests_dict
 
     return embs_stack
 
+
+def accumulate_tdigests(embs_tdigests, mean_embs, emb_dims):
+    # note: tdigest batch update known to be slow so updating serially
+    [
+        embs_tdigests[j].update(mean_embs[i, j].item())
+        for i in range(mean_embs.size(0))
+        for j in range(emb_dims)
+    ]
+
+
+def update_tdigest_dict(embs_tdigests_dict, gene, gene_embs, emb_dims):
+    embs_tdigests_dict[gene] = accumulate_tdigests(
+        embs_tdigests_dict[gene], gene_embs, emb_dims
+    )
+
+
+def update_tdigest_dict_mean(embs_tdigests_dict, gene, emb_dims):
+    embs_tdigests_dict[gene] = tdigest_mean(embs_tdigests_dict[gene], emb_dims)
+
+
+def update_tdigest_dict_median(embs_tdigests_dict, gene, emb_dims):
+    embs_tdigests_dict[gene] = tdigest_median(embs_tdigests_dict[gene], emb_dims)
+
+
+def summarize_gene_embs(h, minibatch, embs_i, embs_tdigests_dict, emb_dims):
+    length_h = minibatch[h]["length"]
+    input_ids_h = minibatch[h]["input_ids"][0:length_h]
+    embs_h = embs_i[h, :, :].unsqueeze(dim=1)
+    dict_h = dict(zip(input_ids_h, embs_h))
+    [
+        update_tdigest_dict(embs_tdigests_dict, k, dict_h[k], emb_dims)
+        for k in dict_h.keys()
+    ]
+
+
+def tdigest_mean(embs_tdigests, emb_dims):
+    return [embs_tdigests[i].trimmed_mean(0, 100) for i in range(emb_dims)]
+
+
+def tdigest_median(embs_tdigests, emb_dims):
+    return [embs_tdigests[i].percentile(50) for i in range(emb_dims)]
+
+
 def test_emb(model, example, layer_to_quant):
     with torch.no_grad():
-        outputs = model(
-            input_ids = example.to("cuda")
-        )
+        outputs = model(input_ids=example.to("cuda"))
 
     embs_test = outputs.hidden_states[layer_to_quant]
     return embs_test.size()[2]
 
-def label_embs(embs, downsampled_data, emb_labels):
+
+def label_cell_embs(embs, downsampled_data, emb_labels):
     embs_df = pd.DataFrame(embs.cpu().numpy())
     if emb_labels is not None:
         for label in emb_labels:
@@ -144,94 +244,145 @@ def label_embs(embs, downsampled_data, emb_labels):
             embs_df[label] = emb_label
     return embs_df
 
+
+def label_gene_embs(embs, downsampled_data, token_gene_dict):
+    gene_set = {
+        element for sublist in downsampled_data["input_ids"] for element in sublist
+    }
+    gene_emb_dict = {k: [] for k in gene_set}
+    for i in range(embs.size()[0]):
+        length = downsampled_data[i]["length"]
+        dict_i = dict(
+            zip(
+                downsampled_data[i]["input_ids"][0:length],
+                embs[i, :, :].unsqueeze(dim=1),
+            )
+        )
+        for k in dict_i.keys():
+            gene_emb_dict[k].append(dict_i[k])
+    for k in gene_emb_dict.keys():
+        gene_emb_dict[k] = (
+            torch.squeeze(torch.mean(torch.stack(gene_emb_dict[k]), dim=0), dim=0)
+            .cpu()
+            .numpy()
+        )
+    embs_df = pd.DataFrame(gene_emb_dict).T
+    embs_df.index = [token_gene_dict[token] for token in embs_df.index]
+    return embs_df
+
+
 def plot_umap(embs_df, emb_dims, label, output_file, kwargs_dict):
-    only_embs_df = embs_df.iloc[:,:emb_dims]
+    only_embs_df = embs_df.iloc[:, :emb_dims]
     only_embs_df.index = pd.RangeIndex(0, only_embs_df.shape[0], name=None).astype(str)
-    only_embs_df.columns = pd.RangeIndex(0, only_embs_df.shape[1], name=None).astype(str)
+    only_embs_df.columns = pd.RangeIndex(0, only_embs_df.shape[1], name=None).astype(
+        str
+    )
     vars_dict = {"embs": only_embs_df.columns}
-    obs_dict = {"cell_id": list(only_embs_df.index),
-                f"{label}": list(embs_df[label])}
+    obs_dict = {"cell_id": list(only_embs_df.index), f"{label}": list(embs_df[label])}
     adata = anndata.AnnData(X=only_embs_df, obs=obs_dict, var=vars_dict)
-    sc.tl.pca(adata, svd_solver='arpack')
+    sc.tl.pca(adata, svd_solver="arpack")
    sc.pp.neighbors(adata)
    sc.tl.umap(adata)
-    sns.set(rc={'figure.figsize':(10,10)}, font_scale=2.3)
+    sns.set(rc={"figure.figsize": (10, 10)}, font_scale=2.3)
    sns.set_style("white")
-    default_kwargs_dict = {"palette":"Set2", "size":200}
+    default_kwargs_dict = {"palette": "Set2", "size": 200}
    if kwargs_dict is not None:
        default_kwargs_dict.update(kwargs_dict)
-
+
    sc.pl.umap(adata, color=label, save=output_file, **default_kwargs_dict)
 
+
 def gen_heatmap_class_colors(labels, df):
-    pal = sns.cubehelix_palette(len(Counter(labels).keys()), light=0.9, dark=0.1, hue=1, reverse=True, start=1, rot=-2)
+    pal = sns.cubehelix_palette(
+        len(Counter(labels).keys()),
+        light=0.9,
+        dark=0.1,
+        hue=1,
+        reverse=True,
+        start=1,
+        rot=-2,
+    )
     lut = dict(zip(map(str, Counter(labels).keys()), pal))
     colors = pd.Series(labels, index=df.index).map(lut)
     return colors
-
+
+
 def gen_heatmap_class_dict(classes, label_colors_series):
-    class_color_dict_df = pd.DataFrame({"classes": classes, "color": label_colors_series})
+    class_color_dict_df = pd.DataFrame(
+        {"classes": classes, "color": label_colors_series}
+    )
     class_color_dict_df = class_color_dict_df.drop_duplicates(subset=["classes"])
-    return dict(zip(class_color_dict_df["classes"],class_color_dict_df["color"]))
-
-def make_colorbar(embs_df, label):
+    return dict(zip(class_color_dict_df["classes"], class_color_dict_df["color"]))
 
+
+def make_colorbar(embs_df, label):
     labels = list(embs_df[label])
-
+
     cell_type_colors = gen_heatmap_class_colors(labels, embs_df)
     label_colors = pd.DataFrame(cell_type_colors, columns=[label])
 
-    for i,row in label_colors.iterrows():
-        colors=row[0]
-        if len(colors)!=3 or any(np.isnan(colors)):
-            print(i,colors)
+    for i, row in label_colors.iterrows():
+        colors = row[0]
+        if len(colors) != 3 or any(np.isnan(colors)):
+            print(i, colors)
 
     label_colors.isna().sum()
-
+
     # create dictionary for colors and classes
     label_color_dict = gen_heatmap_class_dict(labels, label_colors[label])
     return label_colors, label_color_dict
-
+
+
 def plot_heatmap(embs_df, emb_dims, label, output_file, kwargs_dict):
     sns.set_style("white")
     sns.set(font_scale=2)
     plt.figure(figsize=(15, 15), dpi=150)
     label_colors, label_color_dict = make_colorbar(embs_df, label)
-
-    default_kwargs_dict = {"row_cluster": True,
-                           "col_cluster": True,
-                           "row_colors": label_colors,
-                           "standard_scale": 1,
-                           "linewidths": 0,
-                           "xticklabels": False,
-                           "yticklabels": False,
-                           "figsize": (15,15),
-                           "center": 0,
-                           "cmap": "magma"}
-
+
+    default_kwargs_dict = {
+        "row_cluster": True,
+        "col_cluster": True,
+        "row_colors": label_colors,
+        "standard_scale": 1,
+        "linewidths": 0,
+        "xticklabels": False,
+        "yticklabels": False,
+        "figsize": (15, 15),
+        "center": 0,
+        "cmap": "magma",
+    }
+
     if kwargs_dict is not None:
         default_kwargs_dict.update(kwargs_dict)
-    g = sns.clustermap(embs_df.iloc[:,0:emb_dims].apply(pd.to_numeric), **default_kwargs_dict)
+    g = sns.clustermap(
+        embs_df.iloc[:, 0:emb_dims].apply(pd.to_numeric), **default_kwargs_dict
+    )
 
     plt.setp(g.ax_row_colors.get_xmajorticklabels(), rotation=45, ha="right")
 
     for label_color in list(label_color_dict.keys()):
-        g.ax_col_dendrogram.bar(0, 0, color=label_color_dict[label_color], label=label_color, linewidth=0)
+        g.ax_col_dendrogram.bar(
+            0, 0, color=label_color_dict[label_color], label=label_color, linewidth=0
+        )
 
-    l1 = g.ax_col_dendrogram.legend(title=f"{label}",
-                                    loc="lower center",
-                                    ncol=4,
-                                    bbox_to_anchor=(0.5, 1),
-                                    facecolor="white")
+    g.ax_col_dendrogram.legend(
+        title=f"{label}",
+        loc="lower center",
+        ncol=4,
+        bbox_to_anchor=(0.5, 1),
+        facecolor="white",
+    )
 
-    plt.savefig(output_file, bbox_inches='tight')
+    plt.savefig(output_file, bbox_inches="tight")
 
+
 class EmbExtractor:
     valid_option_dict = {
-        "model_type": {"Pretrained","GeneClassifier","CellClassifier"},
+        "model_type": {"Pretrained", "GeneClassifier", "CellClassifier"},
         "num_classes": {int},
-        "emb_mode": {"cell","gene"},
+        "emb_mode": {"cell", "gene"},
         "cell_emb_style": {"mean_pool"},
+        "gene_emb_style": {"mean_pool"},
         "filter_data": {None, dict},
         "max_ncells": {None, int},
         "emb_layer": {-1, 0},
@@ -239,14 +390,16 @@ class EmbExtractor:
         "labels_to_plot": {None, list},
         "forward_batch_size": {int},
         "nproc": {int},
-        "summary_stat": {None, "mean", "median"},
+        "summary_stat": {None, "mean", "median", "exact_mean", "exact_median"},
     }
+
     def __init__(
        self,
        model_type="Pretrained",
        num_classes=0,
        emb_mode="cell",
        cell_emb_style="mean_pool",
+       gene_emb_style="mean_pool",
        filter_data=None,
        max_ncells=1000,
        emb_layer=-1,
@@ -272,6 +425,9 @@ class EmbExtractor:
        cell_emb_style : "mean_pool"
            Method for summarizing cell embeddings.
            Currently only option is mean pooling of gene embeddings for given cell.
+       gene_emb_style : "mean_pool"
+           Method for summarizing gene embeddings.
+           Currently only option is mean pooling of contextual gene embeddings for given gene.
        filter_data : None, dict
            Default is to extract embeddings from all input data.
            Otherwise, dictionary specifying .dataset column name and list of values to filter by.
@@ -296,10 +452,11 @@ class EmbExtractor:
            Batch size for forward pass.
        nproc : int
            Number of CPU processes to use.
-       summary_stat : {None, "mean", "median"}
-           If not None, outputs only approximated mean or median embedding of input data.
-           Recommended if encountering memory constraints while generating goal embedding positions.
-           Slower but more memory-efficient.
+       summary_stat : {None, "mean", "median", "exact_mean", "exact_median"}
+           If exact_mean or exact_median, outputs only exact mean or median embedding of input data.
+           If mean or median, outputs only approximated mean or median embedding of input data.
+           Non-exact recommended if encountering memory constraints while generating goal embedding positions.
+           Non-exact is slower but more memory-efficient.
        token_dictionary_file : Path
            Path to pickle file containing token dictionary (Ensembl ID:token).
        """
@@ -308,6 +465,7 @@ class EmbExtractor:
        self.num_classes = num_classes
        self.emb_mode = emb_mode
        self.cell_emb_style = cell_emb_style
+       self.gene_emb_style = gene_emb_style
        self.filter_data = filter_data
        self.max_ncells = max_ncells
        self.emb_layer = emb_layer
@@ -315,7 +473,12 @@ class EmbExtractor:
        self.labels_to_plot = labels_to_plot
        self.forward_batch_size = forward_batch_size
        self.nproc = nproc
-       self.summary_stat = summary_stat
+       if (summary_stat is not None) and ("exact" in summary_stat):
+           self.summary_stat = None
+           self.exact_summary_stat = summary_stat
+       else:
+           self.summary_stat = summary_stat
+           self.exact_summary_stat = None
 
        self.validate_options()
 
@@ -323,51 +486,49 @@ class EmbExtractor:
        with open(token_dictionary_file, "rb") as f:
            self.gene_token_dict = pickle.load(f)
 
+       self.token_gene_dict = {v: k for k, v in self.gene_token_dict.items()}
        self.pad_token_id = self.gene_token_dict.get("<pad>")
-
-
+
    def validate_options(self):
-       # first disallow options under development
-       if self.emb_mode == "gene":
-           logger.error(
-               "Extraction and plotting of gene-level embeddings currently under development. " \
-               "Current valid option for 'emb_mode': 'cell'"
-           )
-           raise
-
        # confirm arguments are within valid options and compatible with each other
-       for attr_name,valid_options in self.valid_option_dict.items():
+       for attr_name, valid_options in self.valid_option_dict.items():
            attr_value = self.__dict__[attr_name]
-           if type(attr_value) not in {list, dict}:
+           if not isinstance(attr_value, (list, dict)):
                if attr_value in valid_options:
                    continue
            valid_type = False
            for option in valid_options:
-               if (option in [int,list,dict]) and isinstance(attr_value, option):
+               if (option in [int, list, dict, bool]) and isinstance(
+                   attr_value, option
+               ):
                    valid_type = True
                    break
            if valid_type:
                continue
            logger.error(
-               f"Invalid option for {attr_name}. " \
+               f"Invalid option for {attr_name}. "
                f"Valid options for {attr_name}: {valid_options}"
            )
            raise
-
+
        if self.filter_data is not None:
-           for key,value in self.filter_data.items():
-               if type(value) != list:
+           for key, value in self.filter_data.items():
+               if not isinstance(value, list):
                    self.filter_data[key] = [value]
                    logger.warning(
-                       "Values in filter_data dict must be lists. " \
-                       f"Changing {key} value to list ([{value}]).")
-
-   def extract_embs(self,
-                    model_directory,
-                    input_data_file,
-                    output_directory,
-                    output_prefix,
-                    output_torch_embs=False):
+                       "Values in filter_data dict must be lists. "
+                       f"Changing {key} value to list ([{value}])."
+                   )
+
+   def extract_embs(
+       self,
+       model_directory,
+       input_data_file,
+       output_directory,
+       output_prefix,
+       output_torch_embs=False,
+       cell_state=None,
+   ):
        """
        Extract embeddings from input data and save as results in output_directory.
 
@@ -384,42 +545,165 @@ class EmbExtractor:
        output_torch_embs : bool
            Whether or not to also output the embeddings as a tensor.
            Note, if true, will output embeddings as both dataframe and tensor.
+       cell_state : dict
+           Cell state key and value for state embedding extraction.
        """
 
-       filtered_input_data = load_and_filter(self.filter_data, self.nproc, input_data_file)
-       downsampled_data = downsample_and_sort(filtered_input_data, self.max_ncells)
-       model = load_model(self.model_type, self.num_classes, model_directory)
-       layer_to_quant = quant_layers(model)+self.emb_layer
-       embs = get_embs(model,
-                       downsampled_data,
-                       self.emb_mode,
-                       layer_to_quant,
-                       self.pad_token_id,
-                       self.forward_batch_size,
-                       self.summary_stat)
-
-       if self.summary_stat is None:
-           embs_df = label_embs(embs, downsampled_data, self.emb_label)
-       elif self.summary_stat is not None:
-           embs_df = pd.DataFrame(embs.cpu().numpy()).T
+       filtered_input_data = pu.load_and_filter(
+           self.filter_data, self.nproc, input_data_file
+       )
+       if cell_state is not None:
+           filtered_input_data = pu.filter_by_dict(
+               filtered_input_data, cell_state, self.nproc
+           )
+       downsampled_data = pu.downsample_and_sort(filtered_input_data, self.max_ncells)
+       model = pu.load_model(self.model_type, self.num_classes, model_directory)
+       layer_to_quant = pu.quant_layers(model) + self.emb_layer
+       embs = get_embs(
+           model,
+           downsampled_data,
+           self.emb_mode,
+           layer_to_quant,
+           self.pad_token_id,
+           self.forward_batch_size,
+           self.summary_stat,
+       )
+
+       if self.emb_mode == "cell":
+           if self.summary_stat is None:
+               embs_df = label_cell_embs(embs, downsampled_data, self.emb_label)
+           elif self.summary_stat is not None:
+               embs_df = pd.DataFrame(embs.cpu().numpy()).T
+       elif self.emb_mode == "gene":
+           if self.summary_stat is None:
+               embs_df = label_gene_embs(embs, downsampled_data, self.token_gene_dict)
+           elif self.summary_stat is not None:
+               embs_df = pd.DataFrame(embs).T
+               embs_df.index = [self.token_gene_dict[token] for token in embs_df.index]
 
        # save embeddings to output_path
-       output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")
-       embs_df.to_csv(output_path)
-
-       if output_torch_embs == True:
-           return embs_df, embs
+       if cell_state is None:
+           output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")
+           embs_df.to_csv(output_path)
+
+       if self.exact_summary_stat == "exact_mean":
+           embs = embs.mean(dim=0)
+           embs_df = pd.DataFrame(
+               embs_df[0:255].mean(axis="rows"), columns=[self.exact_summary_stat]
+           ).T
+       elif self.exact_summary_stat == "exact_median":
+           embs = torch.median(embs, dim=0)[0]
+           embs_df = pd.DataFrame(
+               embs_df[0:255].median(axis="rows"), columns=[self.exact_summary_stat]
+           ).T
+
+       if cell_state is not None:
+           return embs
        else:
-           return embs_df
-
-   def plot_embs(self,
-                 embs,
-                 plot_style,
-                 output_directory,
-                 output_prefix,
-                 max_ncells_to_plot=1000,
-                 kwargs_dict=None):
-
+           if output_torch_embs:
+               return embs_df, embs
+           else:
+               return embs_df
+
+   def get_state_embs(
+       self,
+       cell_states_to_model,
+       model_directory,
+       input_data_file,
+       output_directory,
+       output_prefix,
+       output_torch_embs=True,
+   ):
+       """
+       Extract exact mean or exact median cell state embedding positions from input data and save as results in output_directory.
+
+       Parameters
+       ----------
+       cell_states_to_model : None, dict
+           Cell states to model if testing perturbations that achieve goal state change.
+           Four-item dictionary with keys: state_key, start_state, goal_state, and alt_states
+           state_key: key specifying name of column in .dataset that defines the start/goal states
+           start_state: value in the state_key column that specifies the start state
+           goal_state: value in the state_key column that specifies the goal end state
+           alt_states: list of values in the state_key column that specify the alternate end states
+           For example: {"state_key": "disease",
+                         "start_state": "dcm",
+                         "goal_state": "nf",
+                         "alt_states": ["hcm", "other1", "other2"]}
+       model_directory : Path
+           Path to directory containing model
+       input_data_file : Path
+           Path to directory containing .dataset inputs
+       output_directory : Path
+           Path to directory where embedding data will be saved as csv
+       output_prefix : str
+           Prefix for output file
+       output_torch_embs : bool
+           Whether or not to also output the embeddings as a tensor.
+           Note, if true, will output embeddings as both dataframe and tensor.
+
+       Outputs
+       ----------
+       Outputs state_embs_dict for use with in silico perturber.
+       Format is dictionary of embedding positions of each cell state to model shifts from/towards.
+       Keys specify each possible cell state to model.
+       Values are target embedding positions as torch.tensor.
+       For example: {"nf": emb_nf,
+                     "hcm": emb_hcm,
+                     "dcm": emb_dcm,
+                     "other1": emb_other1,
+                     "other2": emb_other2}
+       """
+
+       pu.validate_cell_states_to_model(cell_states_to_model)
+       valid_summary_stats = ["exact_mean", "exact_median"]
+       if self.exact_summary_stat not in valid_summary_stats:
+           logger.error(
+               "For extracting state embs, summary_stat in EmbExtractor "
+               f"must be set to option in {valid_summary_stats}"
+           )
+           raise
+
+       state_embs_dict = dict()
+       state_key = cell_states_to_model["state_key"]
+       for k, v in cell_states_to_model.items():
+           if k == "state_key":
+               continue
+           elif (k == "start_state") or (k == "goal_state"):
+               state_embs_dict[v] = self.extract_embs(
+                   model_directory,
+                   input_data_file,
+                   output_directory,
+                   output_prefix,
+                   output_torch_embs,
+                   cell_state={state_key: v},
+               )
+           else:  # k == "alt_states"
+               for alt_state in v:
+                   state_embs_dict[alt_state] = self.extract_embs(
+                       model_directory,
+                       input_data_file,
+                       output_directory,
+                       output_prefix,
+                       output_torch_embs,
+                       cell_state={state_key: alt_state},
+                   )
+
+       output_path = (Path(output_directory) / output_prefix).with_suffix(".pkl")
+       with open(output_path, "wb") as fp:
+           pickle.dump(state_embs_dict, fp)
+
+       return state_embs_dict
+
+   def plot_embs(
+       self,
+       embs,
+       plot_style,
+       output_directory,
+       output_prefix,
+       max_ncells_to_plot=1000,
+       kwargs_dict=None,
+   ):
        """
        Plot embeddings, coloring by provided labels.
 
@@ -440,60 +724,63 @@
        kwargs_dict : dict
            Dictionary of kwargs to pass to plotting function.
        """
-
-       if plot_style not in ["heatmap","umap"]:
+
+       if plot_style not in ["heatmap", "umap"]:
            logger.error(
-               "Invalid option for 'plot_style'. " \
-               "Valid options: {'heatmap','umap'}"
+               "Invalid option for 'plot_style'. " "Valid options: {'heatmap','umap'}"
            )
            raise
-
+
        if (plot_style == "umap") and (self.labels_to_plot is None):
-           logger.error(
-               "Plotting UMAP requires 'labels_to_plot'. "
-           )
+           logger.error("Plotting UMAP requires 'labels_to_plot'. ")
            raise
-
+
        if max_ncells_to_plot > self.max_ncells:
            max_ncells_to_plot = self.max_ncells
            logger.warning(
-               "max_ncells_to_plot must be <= max_ncells. " \
-               f"Changing max_ncells_to_plot to {self.max_ncells}.")
-
-       if (max_ncells_to_plot is not None) \
-           and (max_ncells_to_plot < self.max_ncells):
+               "max_ncells_to_plot must be <= max_ncells. "
+               f"Changing max_ncells_to_plot to {self.max_ncells}."
+           )
+
+       if (max_ncells_to_plot is not None) and (max_ncells_to_plot < self.max_ncells):
            embs = embs.sample(max_ncells_to_plot, axis=0)
-
+
        if self.emb_label is None:
            label_len = 0
        else:
            label_len = len(self.emb_label)
-
+
        emb_dims = embs.shape[1] - label_len
-
+
        if self.emb_label is None:
            emb_labels = None
        else:
            emb_labels = embs.columns[emb_dims:]
-
+
        if plot_style == "umap":
            for label in self.labels_to_plot:
                if label not in emb_labels:
                    logger.warning(
-                       f"Label {label} from labels_to_plot " \
-                       f"not present in provided embeddings dataframe.")
+                       f"Label {label} from labels_to_plot "
+                       f"not present in provided embeddings dataframe."
+                   )
                    continue
                output_prefix_label = "_" + output_prefix + f"_umap_{label}"
-               output_file = (Path(output_directory) / output_prefix_label).with_suffix(".pdf")
+               output_file = (
+                   Path(output_directory) / output_prefix_label
+               ).with_suffix(".pdf")
                plot_umap(embs, emb_dims, label, output_prefix_label, kwargs_dict)
-
+
        if plot_style == "heatmap":
            for label in self.labels_to_plot:
                if label not in emb_labels:
                    logger.warning(
-                       f"Label {label} from labels_to_plot " \
-                       f"not present in provided embeddings dataframe.")
+                       f"Label {label} from labels_to_plot "
+                       f"not present in provided embeddings dataframe."
+                   )
                    continue
                output_prefix_label = output_prefix + f"_heatmap_{label}"
-               output_file = (Path(output_directory) / output_prefix_label).with_suffix(".pdf")
-               plot_heatmap(embs, emb_dims, label, output_file, kwargs_dict)
+               output_file = (
+                   Path(output_directory) / output_prefix_label
+               ).with_suffix(".pdf")
+               plot_heatmap(embs, emb_dims, label, output_file, kwargs_dict)
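Beyond state embeddings, the rewritten `get_embs`/`extract_embs` path also enables the gene-level mode that the old `validate_options` explicitly blocked. A minimal sketch of gene-mode extraction under the new API (paths are placeholders; per `label_gene_embs` above, the returned dataframe has one row per gene, indexed by gene identifier via the new `token_gene_dict` mapping):

```python
from geneformer import EmbExtractor

# gene-level embeddings were previously rejected by validate_options;
# this commit removes that guard and adds the gene-mode code paths
embex = EmbExtractor(model_type="Pretrained",
                     emb_mode="gene",
                     max_ncells=1000,
                     emb_layer=-1,
                     forward_batch_size=100,
                     nproc=16,
                     summary_stat=None)

# one row per detected gene, mean-pooled over its contextual embeddings
# across the sampled cells (see label_gene_embs above)
gene_embs_df = embex.extract_embs("path/to/model",
                                  "path/to/input_data",
                                  "path/to/output_directory",
                                  "output_prefix")
```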
geneformer/in_silico_perturber.py CHANGED
@@ -8,614 +8,66 @@ Usage:
                          genes_to_perturb="all",
                          combos=0,
                          anchor_gene=None,
-                         model_type="Pretrained",
                          num_classes=0,
                          emb_mode="cell",
                          cell_emb_style="mean_pool",
                          filter_data={"cell_type":["cardiomyocyte"]},
                          cell_states_to_model={"state_key": "disease", "start_state": "dcm", "goal_state": "nf", "alt_states": ["hcm", "other1", "other2"]},
                          max_ncells=None,
-                         emb_layer=-1,
                          forward_batch_size=100,
-                         nproc=4)
  isp.perturb_data("path/to/model",
                   "path/to/input_data",
                   "path/to/output_directory",
                   "output_prefix")
  """
 
-# imports
-import itertools as it
 import logging
-import numpy as np
 import pickle
-import re
-import seaborn as sns; sns.set()
-import torch
 from collections import defaultdict
-from datasets import Dataset, load_from_disk
 from tqdm.auto import trange
-from transformers import BertForMaskedLM, BertForTokenClassification, BertForSequenceClassification
 
 from .tokenizer import TOKEN_DICTIONARY_FILE
 
-logger = logging.getLogger(__name__)
-
-
-# load data and filter by defined criteria
-def load_and_filter(filter_data, nproc, input_data_file):
-    data = load_from_disk(input_data_file)
-    if filter_data is not None:
-        for key,value in filter_data.items():
-            def filter_data_by_criteria(example):
-                return example[key] in value
-            data = data.filter(filter_data_by_criteria, num_proc=nproc)
-        if len(data) == 0:
-            logger.error(
-                "No cells remain after filtering. Check filtering criteria.")
-            raise
-    data_shuffled = data.shuffle(seed=42)
-    return data_shuffled
-
-# load model to GPU
-def load_model(model_type, num_classes, model_directory):
-    if model_type == "Pretrained":
-        model = BertForMaskedLM.from_pretrained(model_directory,
-                                                output_hidden_states=True,
-                                                output_attentions=False)
-    elif model_type == "GeneClassifier":
-        model = BertForTokenClassification.from_pretrained(model_directory,
-                                                           num_labels=num_classes,
-                                                           output_hidden_states=True,
-                                                           output_attentions=False)
-    elif model_type == "CellClassifier":
-        model = BertForSequenceClassification.from_pretrained(model_directory,
-                                                              num_labels=num_classes,
-                                                              output_hidden_states=True,
-                                                              output_attentions=False)
-    # put the model in eval mode for fwd pass
-    model.eval()
-    model = model.to("cuda:0")
-    return model
-
-def quant_layers(model):
-    layer_nums = []
-    for name, parameter in model.named_parameters():
-        if "layer" in name:
-            layer_nums += [int(name.split("layer.")[1].split(".")[0])]
-    return int(max(layer_nums))+1
-
-def get_model_input_size(model):
-    return int(re.split("\(|,",str(model.bert.embeddings.position_embeddings))[1])
-
-def flatten_list(megalist):
-    return [item for sublist in megalist for item in sublist]
-
-def measure_length(example):
-    example["length"] = len(example["input_ids"])
-    return example
-
-def downsample_and_sort(data_shuffled, max_ncells):
-    num_cells = len(data_shuffled)
-    # if max number of cells is defined, then subsample to this max number
-    if max_ncells != None:
-        num_cells = min(max_ncells,num_cells)
-    data_subset = data_shuffled.select([i for i in range(num_cells)])
-    # sort dataset with largest cell first to encounter any memory errors earlier
-    data_sorted = data_subset.sort("length",reverse=True)
-    return data_sorted
-
-def get_possible_states(cell_states_to_model):
-    possible_states = []
-    for key in ["start_state","goal_state"]:
-        possible_states += [cell_states_to_model[key]]
-    possible_states += cell_states_to_model.get("alt_states",[])
-    return possible_states
-
-def forward_pass_single_cell(model, example_cell, layer_to_quant):
-    example_cell.set_format(type="torch")
-    input_data = example_cell["input_ids"]
-    with torch.no_grad():
-        outputs = model(
-            input_ids = input_data.to("cuda")
-        )
-    emb = torch.squeeze(outputs.hidden_states[layer_to_quant])
-    del outputs
-    return emb
-
-def perturb_emb_by_index(emb, indices):
-    mask = torch.ones(emb.numel(), dtype=torch.bool)
-    mask[indices] = False
-    return emb[mask]
-
-def delete_indices(example):
-    indices = example["perturb_index"]
-    if any(isinstance(el, list) for el in indices):
-        indices = flatten_list(indices)
-    for index in sorted(indices, reverse=True):
-        del example["input_ids"][index]
-    return example
-
-# for genes_to_perturb = "all" where only genes within cell are overexpressed
-def overexpress_indices(example):
-    indices = example["perturb_index"]
-    if any(isinstance(el, list) for el in indices):
-        indices = flatten_list(indices)
-    for index in sorted(indices, reverse=True):
-        example["input_ids"].insert(0, example["input_ids"].pop(index))
-    return example
-
-# for genes_to_perturb = list of genes to overexpress that are not necessarily expressed in cell
-def overexpress_tokens(example):
-    # -100 indicates tokens to overexpress are not present in rank value encoding
-    if example["perturb_index"] != [-100]:
-        example = delete_indices(example)
-    [example["input_ids"].insert(0, token) for token in example["tokens_to_perturb"][::-1]]
-
-    return example
-
-def remove_indices_from_emb(emb, indices_to_remove, gene_dim):
-    # indices_to_remove is list of indices to remove
-    indices_to_keep = [i for i in range(emb.size()[gene_dim]) if i not in indices_to_remove]
-    num_dims = emb.dim()
-    emb_slice = [slice(None) if dim != gene_dim else indices_to_keep for dim in range(num_dims)]
-    sliced_emb = emb[emb_slice]
-    return sliced_emb
-
-def remove_indices_from_emb_batch(emb_batch, list_of_indices_to_remove, gene_dim):
-    output_batch = torch.stack([
-        remove_indices_from_emb(emb_batch[i, :, :], idxs, gene_dim-1) for
-        i, idxs in enumerate(list_of_indices_to_remove)
-    ])
-    return output_batch
-
-def make_perturbation_batch(example_cell,
-                            perturb_type,
-                            tokens_to_perturb,
-                            anchor_token,
-                            combo_lvl,
-                            num_proc):
-    if tokens_to_perturb == "all":
-        if perturb_type in ["overexpress","activate"]:
-            range_start = 1
-        elif perturb_type in ["delete","inhibit"]:
-            range_start = 0
-        indices_to_perturb = [[i] for i in range(range_start, example_cell["length"][0])]
-    elif combo_lvl>0 and (anchor_token is not None):
-        example_input_ids = example_cell["input_ids "][0]
-        anchor_index = example_input_ids.index(anchor_token[0])
-        indices_to_perturb = [sorted([anchor_index,i]) if i!=anchor_index else None for i in range(example_cell["length"][0])]
-        indices_to_perturb = [item for item in indices_to_perturb if item is not None]
-    else:
-        example_input_ids = example_cell["input_ids"][0]
-        indices_to_perturb = [[example_input_ids.index(token)] if token in example_input_ids else None for token in tokens_to_perturb]
-        indices_to_perturb = [item for item in indices_to_perturb if item is not None]
-
-    # create all permutations of combo_lvl of modifiers from tokens_to_perturb
-    if combo_lvl>0 and (anchor_token is None):
-        if tokens_to_perturb != "all":
-            if len(tokens_to_perturb) == combo_lvl+1:
-                indices_to_perturb = [list(x) for x in it.combinations(indices_to_perturb, combo_lvl+1)]
-        else:
-            all_indices = [[i] for i in range(example_cell["length"][0])]
-            all_indices = [index for index in all_indices if index not in indices_to_perturb]
-            indices_to_perturb = [[[j for i in indices_to_perturb for j in i], x] for x in all_indices]
-    length = len(indices_to_perturb)
-    perturbation_dataset = Dataset.from_dict({"input_ids": example_cell["input_ids"]*length,
-                                              "perturb_index": indices_to_perturb})
-    if length<400:
-        num_proc_i = 1
-    else:
-        num_proc_i = num_proc
-    if perturb_type == "delete":
-        perturbation_dataset = perturbation_dataset.map(delete_indices, num_proc=num_proc_i)
-    elif perturb_type == "overexpress":
-        perturbation_dataset = perturbation_dataset.map(overexpress_indices, num_proc=num_proc_i)
-    return perturbation_dataset, indices_to_perturb
-
-# perturbed cell emb removing the activated/overexpressed/inhibited gene emb
-# so that only non-perturbed gene embeddings are compared to each other
-# in original or perturbed context
-def make_comparison_batch(original_emb_batch, indices_to_perturb, perturb_group):
-    all_embs_list = []
-
-    # if making comparison batch for multiple perturbations in single cell
-    if perturb_group == False:
-        original_emb_list = [original_emb_batch]*len(indices_to_perturb)
-    # if making comparison batch for single perturbation in multiple cells
-    elif perturb_group == True:
-        original_emb_list = original_emb_batch
-
-
-    for i in range(len(original_emb_list)):
-        original_emb = original_emb_list[i]
-        indices = indices_to_perturb[i]
-        if indices == [-100]:
-            all_embs_list += [original_emb[:]]
-            continue
-        emb_list = []
-        start = 0
-        if any(isinstance(el, list) for el in indices):
-            indices = flatten_list(indices)
-        for i in sorted(indices):
-            emb_list += [original_emb[start:i]]
-            start = i+1
-        emb_list += [original_emb[start:]]
-        all_embs_list += [torch.cat(emb_list)]
-    len_set = set([emb.size()[0] for emb in all_embs_list])
-    if len(len_set) > 1:
-        max_len = max(len_set)
-        all_embs_list = [pad_2d_tensor(emb, None, max_len, 0) for emb in all_embs_list]
-    return torch.stack(all_embs_list)
-
-# average embedding position of goal cell states
-def get_cell_state_avg_embs(model,
-                            filtered_input_data,
-                            cell_states_to_model,
-                            layer_to_quant,
-                            pad_token_id,
-                            forward_batch_size,
-                            num_proc):
-
-    model_input_size = get_model_input_size(model)
-    possible_states = get_possible_states(cell_states_to_model)
-    state_embs_dict = dict()
-    for possible_state in possible_states:
-        state_embs_list = []
-        original_lens = []
-
-        def filter_states(example):
-            state_key = cell_states_to_model["state_key"]
-            return example[state_key] in [possible_state]
-        filtered_input_data_state = filtered_input_data.filter(filter_states, num_proc=num_proc)
-        total_batch_length = len(filtered_input_data_state)
-        if ((total_batch_length-1)/forward_batch_size).is_integer():
-            forward_batch_size = forward_batch_size-1
-        max_len = max(filtered_input_data_state["length"])
-        for i in range(0, total_batch_length, forward_batch_size):
-            max_range = min(i+forward_batch_size, total_batch_length)
-
-            state_minibatch = filtered_input_data_state.select([i for i in range(i, max_range)])
-            state_minibatch.set_format(type="torch")
-
-            input_data_minibatch = state_minibatch["input_ids"]
-            original_lens += state_minibatch["length"]
-            input_data_minibatch = pad_tensor_list(input_data_minibatch,
-                                                   max_len,
-                                                   pad_token_id,
-                                                   model_input_size)
-            attention_mask = gen_attention_mask(state_minibatch, max_len)
-
-            with torch.no_grad():
-                outputs = model(
-                    input_ids = input_data_minibatch.to("cuda"),
-                    attention_mask = attention_mask
-                )
-
-            state_embs_i = outputs.hidden_states[layer_to_quant]
-            state_embs_list += [state_embs_i]
-            del outputs
-            del state_minibatch
-            del input_data_minibatch
-            del attention_mask
-            del state_embs_i
-            torch.cuda.empty_cache()
-
-        state_embs = torch.cat(state_embs_list)
-        avg_state_emb = mean_nonpadding_embs(state_embs, torch.Tensor(original_lens).to("cuda"))
-        avg_state_emb = torch.mean(avg_state_emb, dim=0, keepdim=True)
-        state_embs_dict[possible_state] = avg_state_emb
-    return state_embs_dict
-
-# quantify cosine similarity of perturbed vs original or alternate states
-def quant_cos_sims(model,
-                   perturb_type,
-                   perturbation_batch,
-                   forward_batch_size,
-                   layer_to_quant,
-                   original_emb,
-                   tokens_to_perturb,
-                   indices_to_perturb,
-                   perturb_group,
-                   cell_states_to_model,
-                   state_embs_dict,
-                   pad_token_id,
-                   model_input_size,
-                   nproc):
-    cos = torch.nn.CosineSimilarity(dim=2)
-    total_batch_length = len(perturbation_batch)
-
-    if ((total_batch_length-1)/forward_batch_size).is_integer():
-        forward_batch_size = forward_batch_size-1
-
-    if perturb_group == False:
-        comparison_batch = make_comparison_batch(original_emb, indices_to_perturb, perturb_group)
-
-    if cell_states_to_model is None:
-        cos_sims = []
-    else:
-        possible_states = get_possible_states(cell_states_to_model)
-        cos_sims_vs_alt_dict = dict(zip(possible_states,[[] for _ in range(len(possible_states))]))
-
-    # measure length of each element in perturbation_batch
-    perturbation_batch = perturbation_batch.map(
-        measure_length, num_proc=nproc
-    )
 
-    def compute_batch_embeddings(minibatch, _max_len = None):
-        minibatch_lengths = minibatch["length"]
-        minibatch_length_set = set(minibatch_lengths)
-        max_len = model_input_size
 
-        if (len(minibatch_length_set) > 1) or (max(minibatch_length_set) > max_len):
-            needs_pad_or_trunc = True
-        else:
-            needs_pad_or_trunc = False
-            max_len = max(minibatch_length_set)
-
-
-        if needs_pad_or_trunc == True:
-            if _max_len is None:
-                max_len = min(max(minibatch_length_set), max_len)
-            else:
-                max_len = _max_len
-            def pad_or_trunc_example(example):
-                example["input_ids"] = pad_or_truncate_encoding(example["input_ids"],
-                                                                pad_token_id,
-                                                                max_len)
-                return example
-            minibatch = minibatch.map(pad_or_trunc_example, num_proc=nproc)
-
-        minibatch.set_format(type="torch")
-
-        input_data_minibatch = minibatch["input_ids"]
-        attention_mask = gen_attention_mask(minibatch, max_len)
-
-        # extract embeddings for perturbation minibatch
-        with torch.no_grad():
-            outputs = model(
-                input_ids = input_data_minibatch.to("cuda"),
-                attention_mask = attention_mask
-            )
-
-        return outputs, max_len
-
-    for i in range(0, total_batch_length, forward_batch_size):
-        max_range = min(i+forward_batch_size, total_batch_length)
-        perturbation_minibatch = perturbation_batch.select([i for i in range(i, max_range)])
-        outputs, mini_max_len = compute_batch_embeddings(perturbation_minibatch)
-
-        if len(indices_to_perturb)>1:
-            minibatch_emb = torch.squeeze(outputs.hidden_states[layer_to_quant])
-        else:
-            minibatch_emb = outputs.hidden_states[layer_to_quant]
-
-        if perturb_type == "overexpress":
-            # remove overexpressed genes to quantify effect on remaining genes
-            if perturb_group == False:
-                overexpressed_to_remove = 1
-            if perturb_group == True:
-                overexpressed_to_remove = len(tokens_to_perturb)
-            minibatch_emb = minibatch_emb[:, overexpressed_to_remove: ,:]
-
-
-        # if quantifying single perturbation in multiple different cells, pad original batch and extract embs
-        if perturb_group == True:
-            # pad minibatch of original batch to extract embeddings
-            # truncate to the (model input size - # tokens to overexpress) to ensure comparability
-            # since max input size of perturb batch will be reduced by # tokens to overexpress
-            original_minibatch = original_emb.select([i for i in range(i, max_range)])
-            original_outputs, orig_max_len = compute_batch_embeddings(original_minibatch, mini_max_len)
-
-            if len(indices_to_perturb)>1:
-                original_minibatch_emb = torch.squeeze(original_outputs.hidden_states[layer_to_quant])
-            else:
-                original_minibatch_emb = original_outputs.hidden_states[layer_to_quant]
-
-            # if we overexpress genes that aren't already expressed,
-            # we need to remove genes to make sure the embeddings are of a consistent size
-            # get rid of the bottom n genes/padding since those will get truncated anyways
-            # multiple perturbations is more complicated because if 1 out of n perturbed genes is expressed
419
- # the idxs will still not be [-100]
420
- if len(tokens_to_perturb) == 1:
421
- indices_to_perturb_minibatch = [idx if idx != [-100] else [orig_max_len - 1]
422
- for idx in indices_to_perturb[i:max_range]]
423
- else:
424
- num_perturbed = len(tokens_to_perturb)
425
- indices_to_perturb_minibatch = []
426
- end_range = [i for i in range(orig_max_len - tokens_to_perturb, orig_max_len)]
427
- for idx in indices_to_perturb[i:i+max_range]:
428
- if idx == [-100]:
429
- indices_to_perturb_minibatch.append(end_range)
430
- elif len(idx) < len(tokens_to_perturb):
431
- indices_to_perturb_minibatch.append(idx + end_range[-num_perturbed:])
432
- else:
433
- indices_to_perturb_minibatch.append(idx)
434
-
435
- original_minibatch_emb = remove_indices_from_emb_batch(original_minibatch_emb,
436
- indices_to_perturb_minibatch,
437
- gene_dim=1)
438
-
439
- # cosine similarity between original emb and batch items
440
- if cell_states_to_model is None:
441
- if perturb_group == False:
442
- minibatch_comparison = comparison_batch[i:max_range]
443
- elif perturb_group == True:
444
- minibatch_comparison = original_minibatch_emb
445
- cos_sims += [cos(minibatch_emb, minibatch_comparison).to("cpu")]
446
- elif cell_states_to_model is not None:
447
- if perturb_group == False:
448
- original_emb = comparison_batch[i:max_range]
449
- else:
450
- original_minibatch_lengths = torch.tensor(original_minibatch["length"], device="cuda")
451
- minibatch_lengths = torch.tensor(perturbation_minibatch["length"], device="cuda")
452
- for state in possible_states:
453
- if perturb_group == False:
454
- cos_sims_vs_alt_dict[state] += cos_sim_shift(original_emb,
455
- minibatch_emb,
456
- state_embs_dict[state],
457
- perturb_group)
458
- elif perturb_group == True:
459
- cos_sims_vs_alt_dict[state] += cos_sim_shift(original_minibatch_emb,
460
- minibatch_emb,
461
- state_embs_dict[state],
462
- perturb_group,
463
- original_minibatch_lengths,
464
- minibatch_lengths)
465
- del outputs
466
- del minibatch_emb
467
- if cell_states_to_model is None:
468
- del minibatch_comparison
469
- if perturb_group == True:
470
- del original_minibatch_emb
471
- torch.cuda.empty_cache()
472
- if cell_states_to_model is None:
473
- cos_sims_stack = torch.cat(cos_sims)
474
- return cos_sims_stack
475
- else:
476
- for state in possible_states:
477
- cos_sims_vs_alt_dict[state] = torch.cat(cos_sims_vs_alt_dict[state])
478
- return cos_sims_vs_alt_dict
479
-
480
-
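# A quick sketch of what CosineSimilarity(dim=2) computes in the function above
# (shapes are illustrative only, not part of the module):
import torch

cos = torch.nn.CosineSimilarity(dim=2)
perturbed = torch.randn(2, 4, 8)  # (cells, genes, hidden)
original = torch.randn(2, 4, 8)
print(cos(perturbed, original).shape)  # torch.Size([2, 4]): one value per gene per cell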
481
- # calculate cos sim shift of perturbation with respect to origin and alternative cell
482
- def cos_sim_shift(original_emb,
483
- minibatch_emb,
484
- end_emb,
485
- perturb_group,
486
- original_minibatch_lengths = None,
487
- minibatch_lengths = None):
488
- cos = torch.nn.CosineSimilarity(dim=2)
489
- if original_emb.size() != minibatch_emb.size():
490
- logger.error(
491
- f"Embeddings are not the same dimensions. " \
492
- f"original_emb is {original_emb.size()}. " \
493
- f"minibatch_emb is {minibatch_emb.size()}. "
494
- )
495
- raise
496
- if not perturb_group:
497
- original_emb = torch.mean(original_emb,dim=1,keepdim=True)
498
- origin_v_end = torch.squeeze(cos(original_emb, end_emb))
499
- else:
500
- if original_minibatch_lengths is not None:
501
- original_emb = mean_nonpadding_embs(original_emb, original_minibatch_lengths)
502
- # else:
503
- # original_emb = torch.mean(original_emb,dim=1,keepdim=True)
504
-
505
- end_emb = torch.unsqueeze(end_emb, 1)
506
- origin_v_end = torch.squeeze(cos(original_emb, end_emb))
507
- if minibatch_lengths is not None:
508
- perturb_emb = mean_nonpadding_embs(minibatch_emb, minibatch_lengths)
509
- else:
510
- perturb_emb = torch.mean(minibatch_emb,dim=1,keepdim=True)
511
- perturb_v_end = cos(perturb_emb, end_emb)
512
- perturb_v_end = torch.squeeze(perturb_v_end)
513
- if (perturb_v_end-origin_v_end).numel() == 1:
514
- return [([perturb_v_end-origin_v_end]).to("cpu")]
515
- return [(perturb_v_end-origin_v_end).to("cpu")]
516
-
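# The shift above reduces to a difference of cosine similarities toward the
# target state; a standalone sketch with illustrative tensors:
import torch

cos = torch.nn.CosineSimilarity(dim=1)
original = torch.randn(1, 256)   # mean-pooled start-state cell embedding
perturbed = torch.randn(1, 256)  # mean-pooled embedding after perturbation
goal = torch.randn(1, 256)       # average goal-state embedding

shift = cos(perturbed, goal) - cos(original, goal)
print(shift.item())  # positive: the perturbation moved the cell toward the goal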
517
- def pad_list(input_ids, pad_token_id, max_len):
518
- input_ids = np.pad(input_ids,
519
- (0, max_len-len(input_ids)),
520
- mode='constant', constant_values=pad_token_id)
521
- return input_ids
522
-
523
- def pad_tensor(tensor, pad_token_id, max_len):
524
- tensor = torch.nn.functional.pad(tensor, pad=(0,
525
- max_len - tensor.numel()),
526
- mode='constant',
527
- value=pad_token_id)
528
- return tensor
529
-
530
- def pad_2d_tensor(tensor, pad_token_id, max_len, dim):
531
- if dim == 0:
532
- pad = (0, 0, 0, max_len - tensor.size()[dim])
533
- elif dim == 1:
534
- pad = (0, max_len - tensor.size()[dim], 0, 0)
535
- tensor = torch.nn.functional.pad(tensor, pad=pad,
536
- mode='constant',
537
- value=pad_token_id)
538
- return tensor
539
-
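# F.pad's pad tuple for a 2D tensor is ordered (left, right, top, bottom), so
# dim 0 is grown via the last two entries; a quick check with toy values:
import torch

t = torch.ones(2, 4)
padded = torch.nn.functional.pad(t, pad=(0, 0, 0, 3), mode='constant', value=0)
print(padded.shape)  # torch.Size([5, 4])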
540
- def pad_or_truncate_encoding(encoding, pad_token_id, max_len):
541
- if isinstance(encoding, torch.Tensor):
542
- encoding_len = tensor.size()[0]
543
- elif isinstance(encoding, list):
544
- encoding_len = len(encoding)
545
- if encoding_len > max_len:
546
- encoding = encoding[0:max_len]
547
- elif encoding_len < max_len:
548
- if isinstance(encoding, torch.Tensor):
549
- encoding = pad_tensor(encoding, pad_token_id, max_len)
550
- elif isinstance(encoding, list):
551
- encoding = pad_list(encoding, pad_token_id, max_len)
552
- return encoding
553
-
554
- # pad list of tensors and convert to tensor
555
- def pad_tensor_list(tensor_list, dynamic_or_constant, pad_token_id, model_input_size):
556
-
557
- # Determine maximum tensor length
558
- if dynamic_or_constant == "dynamic":
559
- max_len = max([tensor.squeeze().numel() for tensor in tensor_list])
560
- elif type(dynamic_or_constant) == int:
561
- max_len = dynamic_or_constant
562
- else:
563
- max_len = model_input_size
564
- logger.warning(
565
- "If padding style is constant, must provide integer value. " \
566
- f"Setting padding to max input size {model_input_size}.")
567
-
568
- # pad all tensors to maximum length
569
- tensor_list = [pad_tensor(tensor, pad_token_id, max_len) for tensor in tensor_list]
570
-
571
- # return stacked tensors
572
- return torch.stack(tensor_list)
573
-
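# "dynamic" padding as used above, in miniature (illustrative values):
import torch

tensor_list = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]
max_len = max(t.squeeze().numel() for t in tensor_list)
padded = [torch.nn.functional.pad(t, (0, max_len - t.numel()), value=0)
          for t in tensor_list]
print(torch.stack(padded))  # tensor([[5, 6, 7], [8, 9, 0]])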
574
- def gen_attention_mask(minibatch_encoding, max_len = None):
575
- if max_len == None:
576
- max_len = max(minibatch_encoding["length"])
577
- original_lens = minibatch_encoding["length"]
578
- attention_mask = [[1]*original_len
579
- +[0]*(max_len - original_len)
580
- if original_len <= max_len
581
- else [1]*max_len
582
- for original_len in original_lens]
583
- return torch.tensor(attention_mask).to("cuda")
584
-
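# The attention mask is ones over real tokens and zeros over padding; e.g.:
lengths = [3, 5]
max_len = 5
mask = [[1] * n + [0] * (max_len - n) if n <= max_len else [1] * max_len
        for n in lengths]
print(mask)  # [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]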
585
- # get cell embeddings excluding padding
586
- def mean_nonpadding_embs(embs, original_lens):
587
- # mask based on padding lengths
588
- mask = torch.arange(embs.size(1)).unsqueeze(0).to("cuda") < original_lens.unsqueeze(1)
589
-
590
- # extend mask dimensions to match the embeddings tensor
591
- mask = mask.unsqueeze(2).expand_as(embs)
592
-
593
- # use the mask to zero out the embeddings in padded areas
594
- masked_embs = embs * mask.float()
595
-
596
- # sum and divide by the lengths to get the mean of non-padding embs
597
- mean_embs = masked_embs.sum(1) / original_lens.view(-1, 1).float()
598
- return mean_embs
599
 
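# Masked mean pooling as above, on toy shapes (CPU-only, illustrative):
import torch

embs = torch.arange(24, dtype=torch.float).reshape(2, 3, 4)  # (cells, genes, hidden)
lens = torch.tensor([2.0, 3.0])  # non-padding length of each cell

mask = torch.arange(embs.size(1)).unsqueeze(0) < lens.unsqueeze(1)
mean_embs = (embs * mask.unsqueeze(2).float()).sum(1) / lens.view(-1, 1)
print(mean_embs.shape)  # torch.Size([2, 4]): one pooled embedding per cell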
600
  class InSilicoPerturber:
601
  valid_option_dict = {
602
- "perturb_type": {"delete","overexpress","inhibit","activate"},
603
  "perturb_rank_shift": {None, 1, 2, 3},
604
  "genes_to_perturb": {"all", list},
605
  "combos": {0, 1},
606
  "anchor_gene": {None, str},
607
- "model_type": {"Pretrained","GeneClassifier","CellClassifier"},
608
  "num_classes": {int},
609
- "emb_mode": {"cell","cell_and_gene"},
610
  "cell_emb_style": {"mean_pool"},
611
  "filter_data": {None, dict},
612
  "cell_states_to_model": {None, dict},
 
613
  "max_ncells": {None, int},
614
  "cell_inds_to_perturb": {"all", dict},
615
  "emb_layer": {-1, 0},
616
  "forward_batch_size": {int},
617
  "nproc": {int},
618
  }
 
619
  def __init__(
620
  self,
621
  perturb_type="delete",
@@ -629,6 +81,7 @@ class InSilicoPerturber:
629
  cell_emb_style="mean_pool",
630
  filter_data=None,
631
  cell_states_to_model=None,
 
632
  max_ncells=None,
633
  cell_inds_to_perturb="all",
634
  emb_layer=-1,
@@ -676,13 +129,14 @@ class InSilicoPerturber:
676
  For the pretrained Geneformer model, number of classes is 0 as it is not a classifier.
677
  emb_mode : {"cell","cell_and_gene"}
678
  Whether to output impact of perturbation on cell and/or gene embeddings.
 
679
  cell_emb_style : "mean_pool"
680
  Method for summarizing cell embeddings.
681
  Currently only option is mean pooling of gene embeddings for given cell.
682
  filter_data : None, dict
683
  Default is to use all input data for in silico perturbation study.
684
  Otherwise, dictionary specifying .dataset column name and list of values to filter by.
685
- cell_states_to_model: None, dict
686
  Cell states to model if testing perturbations that achieve goal state change.
687
  Four-item dictionary with keys: state_key, start_state, goal_state, and alt_states
688
  state_key: key specifying name of column in .dataset that defines the start/goal states
@@ -693,6 +147,15 @@ class InSilicoPerturber:
693
  "start_state": "dcm",
694
  "goal_state": "nf",
695
  "alt_states": ["hcm", "other1", "other2"]}
696
  max_ncells : None, int
697
  Maximum number of cells to test.
698
  If None, will test all cells.
@@ -705,8 +168,8 @@ class InSilicoPerturber:
705
  Useful for splitting extremely large datasets across separate GPUs.
706
  emb_layer : {-1, 0}
707
  Embedding layer to use for quantification.
708
- -1: 2nd to last layer (recommended for pretrained Geneformer)
709
- 0: last layer (recommended for cell classifier fine-tuned for disease state)
710
  forward_batch_size : int
711
  Batch size for forward pass.
712
  nproc : int
@@ -721,23 +184,25 @@ class InSilicoPerturber:
721
  self.combos = combos
722
  self.anchor_gene = anchor_gene
723
  if self.genes_to_perturb == "all":
724
- self.perturb_group = False
725
  else:
726
  self.perturb_group = True
727
- if (self.anchor_gene != None) or (self.combos != 0):
728
  self.anchor_gene = None
729
  self.combos = 0
730
  logger.warning(
731
- "anchor_gene set to None and combos set to 0. " \
732
- "If providing list of genes to perturb, " \
733
- "list of genes_to_perturb will be perturbed together, "\
734
- "without anchor gene or combinations.")
 
735
  self.model_type = model_type
736
  self.num_classes = num_classes
737
  self.emb_mode = emb_mode
738
  self.cell_emb_style = cell_emb_style
739
  self.filter_data = filter_data
740
  self.cell_states_to_model = cell_states_to_model
 
741
  self.max_ncells = max_ncells
742
  self.cell_inds_to_perturb = cell_inds_to_perturb
743
  self.emb_layer = emb_layer
@@ -758,36 +223,47 @@ class InSilicoPerturber:
758
  try:
759
  self.anchor_token = [self.gene_token_dict[self.anchor_gene]]
760
  except KeyError:
761
- logger.error(
762
- f"Anchor gene {self.anchor_gene} not in token dictionary."
763
- )
764
  raise
765
 
766
  if self.genes_to_perturb == "all":
767
  self.tokens_to_perturb = "all"
768
  else:
769
- missing_genes = [gene for gene in self.genes_to_perturb if gene not in self.gene_token_dict.keys()]
770
  if len(missing_genes) == len(self.genes_to_perturb):
771
  logger.error(
772
  "None of the provided genes to perturb are in token dictionary."
773
  )
774
  raise
775
- elif len(missing_genes)>0:
776
  logger.warning(
777
- f"Genes to perturb {missing_genes} are not in token dictionary.")
778
- self.tokens_to_perturb = [self.gene_token_dict.get(gene) for gene in self.genes_to_perturb]
779
 
780
  def validate_options(self):
781
  # first disallow options under development
782
  if self.perturb_type in ["inhibit", "activate"]:
783
  logger.error(
784
- "In silico inhibition and activation currently under development. " \
785
  "Current valid options for 'perturb_type': 'delete' or 'overexpress'"
786
  )
787
  raise
788
-
789
  # confirm arguments are within valid options and compatible with each other
790
- for attr_name,valid_options in self.valid_option_dict.items():
791
  attr_value = self.__dict__[attr_name]
792
  if type(attr_value) not in {list, dict}:
793
  if attr_value in valid_options:
@@ -797,141 +273,120 @@ class InSilicoPerturber:
797
  continue
798
  valid_type = False
799
  for option in valid_options:
800
- if (option in [int,list,dict]) and isinstance(attr_value, option):
801
  valid_type = True
802
  break
803
  if valid_type:
804
  continue
805
  logger.error(
806
- f"Invalid option for {attr_name}. " \
807
  f"Valid options for {attr_name}: {valid_options}"
808
  )
809
  raise
810
-
811
- if self.perturb_type in ["delete","overexpress"]:
812
  if self.perturb_rank_shift is not None:
813
  if self.perturb_type == "delete":
814
  logger.warning(
815
- "perturb_rank_shift set to None. " \
816
- "If perturb type is delete then gene is deleted entirely " \
817
- "rather than shifted by quartile")
 
818
  elif self.perturb_type == "overexpress":
819
  logger.warning(
820
- "perturb_rank_shift set to None. " \
821
- "If perturb type is overexpress then gene is moved to front " \
822
- "of rank value encoding rather than shifted by quartile")
 
823
  self.perturb_rank_shift = None
824
-
825
  if (self.anchor_gene is not None) and (self.emb_mode == "cell_and_gene"):
826
  self.emb_mode = "cell"
827
  logger.warning(
828
- "emb_mode set to 'cell'. " \
829
- "Currently, analysis with anchor gene " \
830
- "only outputs effect on cell embeddings.")
831
-
 
832
  if self.cell_states_to_model is not None:
833
- if len(self.cell_states_to_model.items()) == 1:
834
  logger.warning(
835
- "The single value dictionary for cell_states_to_model will be " \
836
- "replaced with a dictionary with named keys for start, goal, and alternate states. " \
837
- "Please specify state_key, start_state, goal_state, and alt_states " \
838
- "in the cell_states_to_model dictionary for future use. " \
839
- "For example, cell_states_to_model={" \
840
- "'state_key': 'disease', " \
841
- "'start_state': 'dcm', " \
842
- "'goal_state': 'nf', " \
843
- "'alt_states': ['hcm', 'other1', 'other2']}"
844
  )
845
- for key,value in self.cell_states_to_model.items():
846
- if (len(value) == 3) and isinstance(value, tuple):
847
- if isinstance(value[0],list) and isinstance(value[1],list) and isinstance(value[2],list):
848
- if len(value[0]) == 1 and len(value[1]) == 1:
849
- all_values = value[0]+value[1]+value[2]
850
- if len(all_values) == len(set(all_values)):
851
- continue
852
- # reformat to the new named key format
853
- state_values = flatten_list(list(self.cell_states_to_model.values()))
854
- self.cell_states_to_model = {
855
- "state_key": list(self.cell_states_to_model.keys())[0],
856
- "start_state": state_values[0][0],
857
- "goal_state": state_values[1][0],
858
- "alt_states": state_values[2:][0]
859
- }
860
- elif set(self.cell_states_to_model.keys()) == {"state_key", "start_state", "goal_state", "alt_states"}:
861
- if (self.cell_states_to_model["state_key"] is None) \
862
- or (self.cell_states_to_model["start_state"] is None) \
863
- or (self.cell_states_to_model["goal_state"] is None):
864
- logger.error(
865
- "Please specify 'state_key', 'start_state', and 'goal_state' in cell_states_to_model.")
866
- raise
867
-
868
- if self.cell_states_to_model["start_state"] == self.cell_states_to_model["goal_state"]:
869
  logger.error(
870
- "All states must be unique.")
 
871
  raise
872
 
873
- if self.cell_states_to_model["alt_states"] is not None:
874
- if type(self.cell_states_to_model["alt_states"]) is not list:
875
- logger.error(
876
- "self.cell_states_to_model['alt_states'] must be a list (even if it is one element)."
877
- )
878
- raise
879
- if len(self.cell_states_to_model["alt_states"])!= len(set(self.cell_states_to_model["alt_states"])):
880
- logger.error(
881
- "All states must be unique.")
882
- raise
883
-
884
- else:
885
  logger.error(
886
- "cell_states_to_model must only have the following four keys: " \
887
- "'state_key', 'start_state', 'goal_state', 'alt_states'." \
888
- "For example, cell_states_to_model={" \
889
- "'state_key': 'disease', " \
890
- "'start_state': 'dcm', " \
891
- "'goal_state': 'nf', " \
892
- "'alt_states': ['hcm', 'other1', 'other2']}"
893
  )
894
  raise
895
 
896
- if self.anchor_gene is not None:
897
- self.anchor_gene = None
898
- logger.warning(
899
- "anchor_gene set to None. " \
900
- "Currently, anchor gene not available " \
901
- "when modeling multiple cell states.")
902
-
903
- if self.perturb_type in ["inhibit","activate"]:
904
  if self.perturb_rank_shift is None:
905
  logger.error(
906
- "If perturb_type is inhibit or activate then " \
907
- "quartile to shift by must be specified.")
 
908
  raise
909
-
910
  if self.filter_data is not None:
911
- for key,value in self.filter_data.items():
912
- if type(value) != list:
913
  self.filter_data[key] = [value]
914
  logger.warning(
915
- "Values in filter_data dict must be lists. " \
916
- f"Changing {key} value to list ([{value}]).")
917
-
 
918
  if self.cell_inds_to_perturb != "all":
919
  if set(self.cell_inds_to_perturb.keys()) != {"start", "end"}:
920
  logger.error(
921
  "If cell_inds_to_perturb is a dictionary, keys must be 'start' and 'end'."
922
  )
923
  raise
924
- if self.cell_inds_to_perturb["start"] < 0 or self.cell_inds_to_perturb["end"] < 0:
925
- logger.error(
926
- 'cell_inds_to_perturb must be positive.'
927
- )
 
928
  raise
929
 
930
- def perturb_data(self,
931
- model_directory,
932
- input_data_file,
933
- output_directory,
934
- output_prefix):
935
  """
936
  Perturb genes in input data and save as results in output_directory.
937
 
@@ -947,365 +402,506 @@ class InSilicoPerturber:
947
  Prefix for output files
948
  """
949
 
950
- filtered_input_data = load_and_filter(self.filter_data, self.nproc, input_data_file)
951
- model = load_model(self.model_type, self.num_classes, model_directory)
952
- layer_to_quant = quant_layers(model)+self.emb_layer
953
-
954
- if self.cell_states_to_model is None:
955
- state_embs_dict = None
956
  else:
957
- # confirm that all states are valid to prevent futile filtering
958
- state_name = self.cell_states_to_model["state_key"]
959
- state_values = filtered_input_data[state_name]
960
- for value in get_possible_states(self.cell_states_to_model):
961
- if value not in state_values:
962
- logger.error(
963
- f"{value} is not present in the dataset's {state_name} attribute.")
964
- raise
965
- # get dictionary of average cell state embeddings for comparison
966
- downsampled_data = downsample_and_sort(filtered_input_data, self.max_ncells)
967
- state_embs_dict = get_cell_state_avg_embs(model,
968
- downsampled_data,
969
- self.cell_states_to_model,
970
- layer_to_quant,
971
- self.pad_token_id,
972
- self.forward_batch_size,
973
- self.nproc)
974
- # filter for start state cells
975
- start_state = self.cell_states_to_model["start_state"]
976
- def filter_for_origin(example):
977
- return example[state_name] in [start_state]
978
-
979
- filtered_input_data = filtered_input_data.filter(filter_for_origin, num_proc=self.nproc)
980
-
981
- self.in_silico_perturb(model,
982
- filtered_input_data,
983
- layer_to_quant,
984
- state_embs_dict,
985
- output_directory,
986
- output_prefix)
987
-
988
- # determine effect of perturbation on other genes
989
- def in_silico_perturb(self,
990
- model,
991
- filtered_input_data,
992
- layer_to_quant,
993
- state_embs_dict,
994
- output_directory,
995
- output_prefix):
996
-
997
- output_path_prefix = f"{output_directory}in_silico_{self.perturb_type}_{output_prefix}_dict_1Kbatch"
998
- model_input_size = get_model_input_size(model)
999
-
1000
- # filter dataset for cells that have tokens to be perturbed
1001
- if self.anchor_token is not None:
1002
- def if_has_tokens_to_perturb(example):
1003
- return (len(set(example["input_ids"]).intersection(self.anchor_token))==len(self.anchor_token))
1004
- filtered_input_data = filtered_input_data.filter(if_has_tokens_to_perturb, num_proc=self.nproc)
1005
- if len(filtered_input_data) == 0:
1006
- logger.error(
1007
- "No cells in dataset contain anchor gene.")
1008
- raise
1009
- else:
1010
- logger.info(f"# cells with anchor gene: {len(filtered_input_data)}")
1011
-
1012
  if (self.tokens_to_perturb != "all") and (self.perturb_type != "overexpress"):
1013
- # minimum # genes needed for perturbation test
1014
- min_genes = len(self.tokens_to_perturb)
1015
-
1016
- def if_has_tokens_to_perturb(example):
1017
- return (len(set(example["input_ids"]).intersection(self.tokens_to_perturb))>=min_genes)
1018
- filtered_input_data = filtered_input_data.filter(if_has_tokens_to_perturb, num_proc=self.nproc)
1019
- if len(filtered_input_data) == 0:
1020
- logger.error(
1021
- "No cells in dataset contain all genes to perturb as a group.")
1022
- raise
1023
-
1024
- cos_sims_dict = defaultdict(list)
1025
- pickle_batch = -1
1026
- filtered_input_data = downsample_and_sort(filtered_input_data, self.max_ncells)
1027
  if self.cell_inds_to_perturb != "all":
1028
- if self.cell_inds_to_perturb["start"] >= len(filtered_input_data):
1029
- logger.error("cell_inds_to_perturb['start'] is larger than the filtered dataset.")
1030
- raise
1031
- if self.cell_inds_to_perturb["end"] > len(filtered_input_data):
1032
- logger.warning("cell_inds_to_perturb['end'] is larger than the filtered dataset. \
1033
- Setting to the end of the filtered dataset.")
1034
- self.cell_inds_to_perturb["end"] = len(filtered_input_data)
1035
- filtered_input_data = filtered_input_data.select([i for i in range(self.cell_inds_to_perturb["start"], self.cell_inds_to_perturb["end"])])
1036
-
1037
- # make perturbation batch w/ single perturbation in multiple cells
1038
- if self.perturb_group == True:
1039
-
1040
- def make_group_perturbation_batch(example):
1041
- example_input_ids = example["input_ids"]
1042
- example["tokens_to_perturb"] = self.tokens_to_perturb
1043
- indices_to_perturb = [example_input_ids.index(token) if token in example_input_ids else None for token in self.tokens_to_perturb]
1044
- indices_to_perturb = [item for item in indices_to_perturb if item is not None]
1045
- if len(indices_to_perturb) > 0:
1046
- example["perturb_index"] = indices_to_perturb
1047
  else:
1048
- # -100 indicates tokens to overexpress are not present in rank value encoding
1049
- example["perturb_index"] = [-100]
1050
- if self.perturb_type == "delete":
1051
- example = delete_indices(example)
1052
- elif self.perturb_type == "overexpress":
1053
- example = overexpress_tokens(example)
1054
- return example
1055
-
1056
- perturbation_batch = filtered_input_data.map(make_group_perturbation_batch, num_proc=self.nproc)
1057
- indices_to_perturb = perturbation_batch["perturb_index"]
1058
-
1059
- cos_sims_data = quant_cos_sims(model,
1060
- self.perturb_type,
1061
- perturbation_batch,
1062
- self.forward_batch_size,
1063
- layer_to_quant,
1064
- filtered_input_data,
1065
- self.tokens_to_perturb,
1066
- indices_to_perturb,
1067
- self.perturb_group,
1068
- self.cell_states_to_model,
1069
- state_embs_dict,
1070
- self.pad_token_id,
1071
- model_input_size,
1072
- self.nproc)
1073
-
1074
- perturbed_genes = tuple(self.tokens_to_perturb)
1075
- original_lengths = filtered_input_data["length"]
1076
  if self.cell_states_to_model is None:
1077
- # update cos sims dict
1078
- # key is tuple of (perturbed_gene, affected_gene)
1079
- # or (perturbed_genes, "cell_emb") for avg cell emb change
1080
- cos_sims_data = cos_sims_data.to("cuda")
1081
- max_padded_len = cos_sims_data.shape[1]
1082
- for j in range(cos_sims_data.shape[0]):
1083
- # remove padding before mean pooling cell embedding
1084
- original_length = original_lengths[j]
1085
- gene_list = filtered_input_data[j]["input_ids"]
1086
- indices_removed = indices_to_perturb[j]
1087
- padding_to_remove = max_padded_len - (original_length \
1088
- - len(self.tokens_to_perturb) \
1089
- - len(indices_removed))
1090
- nonpadding_cos_sims_data = cos_sims_data[j][:-padding_to_remove]
1091
- cell_cos_sim = torch.mean(nonpadding_cos_sims_data).item()
1092
- cos_sims_dict[(perturbed_genes, "cell_emb")] += [cell_cos_sim]
1093
-
1094
- if self.emb_mode == "cell_and_gene":
1095
- for k in range(cos_sims_data.shape[1]):
1096
- cos_sim_value = nonpadding_cos_sims_data[k]
1097
- affected_gene = gene_list[k].item()
1098
- cos_sims_dict[(perturbed_genes, affected_gene)] += [cos_sim_value.item()]
1099
  else:
1100
- # update cos sims dict
1101
- # key is tuple of (perturbed_genes, "cell_emb")
1102
- # value is list of tuples of cos sims for cell_states_to_model
1103
- origin_state_key = self.cell_states_to_model["start_state"]
1104
- cos_sims_origin = cos_sims_data[origin_state_key]
1105
- for j in range(cos_sims_origin.shape[0]):
1106
- data_list = []
1107
- for data in list(cos_sims_data.values()):
1108
- data_item = data.to("cuda")
1109
- data_list += [data_item[j].item()]
1110
- cos_sims_dict[(perturbed_genes, "cell_emb")] += [tuple(data_list)]
1111
-
1112
- with open(f"{output_path_prefix}_raw.pickle", "wb") as fp:
1113
- pickle.dump(cos_sims_dict, fp)
1114
-
1115
- # make perturbation batch w/ multiple perturbations in single cell
1116
- if self.perturb_group == False:
1117
-
1118
- for i in trange(len(filtered_input_data)):
1119
- example_cell = filtered_input_data.select([i])
1120
- original_emb = forward_pass_single_cell(model, example_cell, layer_to_quant)
1121
- gene_list = torch.squeeze(example_cell["input_ids"])
1122
-
1123
- # reset to original type to prevent downstream issues due to forward_pass_single_cell modifying as torch format in place
1124
- example_cell = filtered_input_data.select([i])
1125
-
1126
- if self.anchor_token is None:
1127
- for combo_lvl in range(self.combos+1):
1128
- perturbation_batch, indices_to_perturb = make_perturbation_batch(example_cell,
1129
- self.perturb_type,
1130
- self.tokens_to_perturb,
1131
- self.anchor_token,
1132
- combo_lvl,
1133
- self.nproc)
1134
- cos_sims_data = quant_cos_sims(model,
1135
- self.perturb_type,
1136
- perturbation_batch,
1137
- self.forward_batch_size,
1138
- layer_to_quant,
1139
- original_emb,
1140
- self.tokens_to_perturb,
1141
- indices_to_perturb,
1142
- self.perturb_group,
1143
- self.cell_states_to_model,
1144
- state_embs_dict,
1145
- self.pad_token_id,
1146
- model_input_size,
1147
- self.nproc)
1148
-
1149
- if self.cell_states_to_model is None:
1150
- # update cos sims dict
1151
- # key is tuple of (perturbed_gene, affected_gene)
1152
- # or (perturbed_gene, "cell_emb") for avg cell emb change
1153
- cos_sims_data = cos_sims_data.to("cuda")
1154
- for j in range(cos_sims_data.shape[0]):
1155
- if self.tokens_to_perturb != "all":
1156
- j_index = torch.tensor(indices_to_perturb[j])
1157
- if j_index.shape[0]>1:
1158
- j_index = torch.squeeze(j_index)
1159
- else:
1160
- j_index = torch.tensor([j])
1161
-
1162
- if self.perturb_type in ("overexpress", "activate"):
1163
- perturbed_gene = torch.index_select(gene_list, 0, j_index + 1)
1164
- else:
1165
- perturbed_gene = torch.index_select(gene_list, 0, j_index)
1166
-
1167
- if perturbed_gene.shape[0]==1:
1168
- perturbed_gene = perturbed_gene.item()
1169
- elif perturbed_gene.shape[0]>1:
1170
- perturbed_gene = tuple(perturbed_gene.tolist())
1171
-
1172
- cell_cos_sim = torch.mean(cos_sims_data[j]).item()
1173
- cos_sims_dict[(perturbed_gene, "cell_emb")] += [cell_cos_sim]
1174
-
1175
- # not_j_index = list(set(i for i in range(gene_list.shape[0])).difference(j_index))
1176
- # gene_list_j = torch.index_select(gene_list, 0, j_index)
1177
- if self.emb_mode == "cell_and_gene":
1178
- for k in range(cos_sims_data.shape[1]):
1179
- cos_sim_value = cos_sims_data[j][k]
1180
- affected_gene = gene_list[k].item()
1181
- cos_sims_dict[(perturbed_gene, affected_gene)] += [cos_sim_value.item()]
1182
- else:
1183
- # update cos sims dict
1184
- # key is tuple of (perturbed_gene, "cell_emb")
1185
- # value is list of tuples of cos sims for cell_states_to_model
1186
- origin_state_key = self.cell_states_to_model["start_state"]
1187
- cos_sims_origin = cos_sims_data[origin_state_key]
1188
-
1189
- for j in range(cos_sims_origin.shape[0]):
1190
- if (self.tokens_to_perturb != "all") or (combo_lvl>0):
1191
- j_index = torch.tensor(indices_to_perturb[j])
1192
- if j_index.shape[0]>1:
1193
- j_index = torch.squeeze(j_index)
1194
- else:
1195
- j_index = torch.tensor([j])
1196
-
1197
- if self.perturb_type in ("overexpress", "activate"):
1198
- perturbed_gene = torch.index_select(gene_list, 0, j_index + 1)
1199
- else:
1200
- perturbed_gene = torch.index_select(gene_list, 0, j_index)
1201
-
1202
- if perturbed_gene.shape[0]==1:
1203
- perturbed_gene = perturbed_gene.item()
1204
- elif perturbed_gene.shape[0]>1:
1205
- perturbed_gene = tuple(perturbed_gene.tolist())
1206
-
1207
- data_list = []
1208
- for data in list(cos_sims_data.values()):
1209
- data_item = data.to("cuda")
1210
- cell_data = torch.mean(data_item[j]).item()
1211
- data_list += [cell_data]
1212
- cos_sims_dict[(perturbed_gene, "cell_emb")] += [tuple(data_list)]
1213
-
1214
- elif self.anchor_token is not None:
1215
- perturbation_batch, indices_to_perturb = make_perturbation_batch(example_cell,
1216
- self.perturb_type,
1217
- self.tokens_to_perturb,
1218
- None, # first run without anchor token to test individual gene perturbations
1219
- 0,
1220
- self.nproc)
1221
- cos_sims_data = quant_cos_sims(model,
1222
- self.perturb_type,
1223
- perturbation_batch,
1224
- self.forward_batch_size,
1225
- layer_to_quant,
1226
- original_emb,
1227
- self.tokens_to_perturb,
1228
- indices_to_perturb,
1229
- self.perturb_group,
1230
- self.cell_states_to_model,
1231
- state_embs_dict,
1232
- self.pad_token_id,
1233
- model_input_size,
1234
- self.nproc)
1235
- cos_sims_data = cos_sims_data.to("cuda")
1236
-
1237
- combo_perturbation_batch, combo_indices_to_perturb = make_perturbation_batch(example_cell,
1238
- self.perturb_type,
1239
- self.tokens_to_perturb,
1240
- self.anchor_token,
1241
- 1,
1242
- self.nproc)
1243
- combo_cos_sims_data = quant_cos_sims(model,
1244
- self.perturb_type,
1245
- combo_perturbation_batch,
1246
- self.forward_batch_size,
1247
- layer_to_quant,
1248
- original_emb,
1249
- self.tokens_to_perturb,
1250
- combo_indices_to_perturb,
1251
- self.perturb_group,
1252
- self.cell_states_to_model,
1253
- state_embs_dict,
1254
- self.pad_token_id,
1255
- model_input_size,
1256
- self.nproc)
1257
- combo_cos_sims_data = combo_cos_sims_data.to("cuda")
1258
-
1259
- # update cos sims dict
1260
- # key is tuple of (perturbed_gene, "cell_emb") for avg cell emb change
1261
- anchor_index = example_cell["input_ids"][0].index(self.anchor_token[0])
1262
- anchor_cell_cos_sim = torch.mean(cos_sims_data[anchor_index]).item()
1263
- non_anchor_indices = [k for k in range(cos_sims_data.shape[0]) if k != anchor_index]
1264
- cos_sims_data = cos_sims_data[non_anchor_indices,:]
1265
-
1266
- for j in range(cos_sims_data.shape[0]):
1267
-
1268
- if j<anchor_index:
1269
- j_index = torch.tensor([j])
1270
- else:
1271
- j_index = torch.tensor([j+1])
1272
-
1273
- perturbed_gene = torch.index_select(gene_list, 0, j_index)
1274
- perturbed_gene = perturbed_gene.item()
1275
-
1276
- cell_cos_sim = torch.mean(cos_sims_data[j]).item()
1277
- combo_cos_sim = torch.mean(combo_cos_sims_data[j]).item()
1278
- cos_sims_dict[(perturbed_gene, "cell_emb")] += [(anchor_cell_cos_sim, # cos sim anchor gene alone
1279
- cell_cos_sim, # cos sim deleted gene alone
1280
- combo_cos_sim)] # cos sim anchor gene + deleted gene
1281
-
1282
- # save dict to disk every 100 cells
1283
- if (i/100).is_integer():
1284
- with open(f"{output_path_prefix}{pickle_batch}_raw.pickle", "wb") as fp:
1285
- pickle.dump(cos_sims_dict, fp)
1286
- # reset and clear memory every 1000 cells
1287
- if (i/1000).is_integer():
1288
- pickle_batch = pickle_batch+1
1289
- # clear memory
1290
- del perturbed_gene
1291
- del cos_sims_data
1292
- if self.cell_states_to_model is None:
1293
- del cell_cos_sim
1294
- if self.cell_states_to_model is not None:
1295
- del cell_data
1296
- del data_list
1297
- elif self.anchor_token is None:
1298
- if self.emb_mode == "cell_and_gene":
1299
- del affected_gene
1300
- del cos_sim_value
1301
- else:
1302
- del combo_cos_sim
1303
- del combo_cos_sims_data
1304
- # reset dict
1305
- del cos_sims_dict
1306
  cos_sims_dict = defaultdict(list)
1307
- torch.cuda.empty_cache()
1308
 
1309
- # save remainder cells
1310
- with open(f"{output_path_prefix}{pickle_batch}_raw.pickle", "wb") as fp:
1311
- pickle.dump(cos_sims_dict, fp)
8
  genes_to_perturb="all",
9
  combos=0,
10
  anchor_gene=None,
11
+ model_type="CellClassifier",
12
  num_classes=0,
13
  emb_mode="cell",
14
  cell_emb_style="mean_pool",
15
  filter_data={"cell_type":["cardiomyocyte"]},
16
  cell_states_to_model={"state_key": "disease", "start_state": "dcm", "goal_state": "nf", "alt_states": ["hcm", "other1", "other2"]},
17
+ state_embs_dict={"nf": emb_nf, "hcm": emb_hcm, "dcm": emb_dcm, "other1": emb_other1, "other2": emb_other2},
18
  max_ncells=None,
19
+ emb_layer=0,
20
  forward_batch_size=100,
21
+ nproc=16)
22
  isp.perturb_data("path/to/model",
23
  "path/to/input_data",
24
  "path/to/output_directory",
25
  "output_prefix")
26
  """
27
 
28
  import logging
29
+
30
+ # imports
31
+ import os
32
  import pickle
33
  from collections import defaultdict
34
+
35
+ import seaborn as sns
36
+ import torch
37
+ from datasets import Dataset
38
  from tqdm.auto import trange
 
39
 
40
+ from . import perturber_utils as pu
41
+ from .emb_extractor import get_embs
42
  from .tokenizer import TOKEN_DICTIONARY_FILE
43
 
44
+ sns.set()
45
 
46
 
47
+ logger = logging.getLogger(__name__)
48
 
49
 
50
  class InSilicoPerturber:
51
  valid_option_dict = {
52
+ "perturb_type": {"delete", "overexpress", "inhibit", "activate"},
53
  "perturb_rank_shift": {None, 1, 2, 3},
54
  "genes_to_perturb": {"all", list},
55
  "combos": {0, 1},
56
  "anchor_gene": {None, str},
57
+ "model_type": {"Pretrained", "GeneClassifier", "CellClassifier"},
58
  "num_classes": {int},
59
+ "emb_mode": {"cell", "cell_and_gene"},
60
  "cell_emb_style": {"mean_pool"},
61
  "filter_data": {None, dict},
62
  "cell_states_to_model": {None, dict},
63
+ "state_embs_dict": {None, dict},
64
  "max_ncells": {None, int},
65
  "cell_inds_to_perturb": {"all", dict},
66
  "emb_layer": {-1, 0},
67
  "forward_batch_size": {int},
68
  "nproc": {int},
69
  }
70
+
71
  def __init__(
72
  self,
73
  perturb_type="delete",
81
  cell_emb_style="mean_pool",
82
  filter_data=None,
83
  cell_states_to_model=None,
84
+ state_embs_dict=None,
85
  max_ncells=None,
86
  cell_inds_to_perturb="all",
87
  emb_layer=-1,
129
  For the pretrained Geneformer model, number of classes is 0 as it is not a classifier.
130
  emb_mode : {"cell","cell_and_gene"}
131
  Whether to output impact of perturbation on cell and/or gene embeddings.
132
+ Gene embedding shifts only available as compared to original cell, not comparing to goal state.
133
  cell_emb_style : "mean_pool"
134
  Method for summarizing cell embeddings.
135
  Currently only option is mean pooling of gene embeddings for given cell.
136
  filter_data : None, dict
137
  Default is to use all input data for in silico perturbation study.
138
  Otherwise, dictionary specifying .dataset column name and list of values to filter by.
139
+ cell_states_to_model : None, dict
140
  Cell states to model if testing perturbations that achieve goal state change.
141
  Four-item dictionary with keys: state_key, start_state, goal_state, and alt_states
142
  state_key: key specifying name of column in .dataset that defines the start/goal states
147
  "start_state": "dcm",
148
  "goal_state": "nf",
149
  "alt_states": ["hcm", "other1", "other2"]}
150
+ state_embs_dict : None, dict
151
+ Embedding positions (e.g. mean or median) of each cell state to model shifts from/towards.
152
+ Dictionary with keys specifying each possible cell state to model.
153
+ Values are target embedding positions as torch.tensor.
154
+ For example: {"nf": emb_nf,
155
+ "hcm": emb_hcm,
156
+ "dcm": emb_dcm,
157
+ "other1": emb_other1,
158
+ "other2": emb_other2}
159
  max_ncells : None, int
160
  Maximum number of cells to test.
161
  If None, will test all cells.
168
  Useful for splitting extremely large datasets across separate GPUs.
169
  emb_layer : {-1, 0}
170
  Embedding layer to use for quantification.
171
+ 0: last layer (recommended for questions closely tied to model's training objective)
172
+ -1: 2nd to last layer (recommended for questions requiring more general representations)
173
  forward_batch_size : int
174
  Batch size for forward pass.
175
  nproc : int
184
  self.combos = combos
185
  self.anchor_gene = anchor_gene
186
  if self.genes_to_perturb == "all":
187
+ self.perturb_group = False
188
  else:
189
  self.perturb_group = True
190
+ if (self.anchor_gene is not None) or (self.combos != 0):
191
  self.anchor_gene = None
192
  self.combos = 0
193
  logger.warning(
194
+ "anchor_gene set to None and combos set to 0. "
195
+ "If providing list of genes to perturb, "
196
+ "list of genes_to_perturb will be perturbed together, "
197
+ "without anchor gene or combinations."
198
+ )
199
  self.model_type = model_type
200
  self.num_classes = num_classes
201
  self.emb_mode = emb_mode
202
  self.cell_emb_style = cell_emb_style
203
  self.filter_data = filter_data
204
  self.cell_states_to_model = cell_states_to_model
205
+ self.state_embs_dict = state_embs_dict
206
  self.max_ncells = max_ncells
207
  self.cell_inds_to_perturb = cell_inds_to_perturb
208
  self.emb_layer = emb_layer
223
  try:
224
  self.anchor_token = [self.gene_token_dict[self.anchor_gene]]
225
  except KeyError:
226
+ logger.error(f"Anchor gene {self.anchor_gene} not in token dictionary.")
227
  raise
228
 
229
  if self.genes_to_perturb == "all":
230
  self.tokens_to_perturb = "all"
231
  else:
232
+ missing_genes = [
233
+ gene
234
+ for gene in self.genes_to_perturb
235
+ if gene not in self.gene_token_dict.keys()
236
+ ]
237
  if len(missing_genes) == len(self.genes_to_perturb):
238
  logger.error(
239
  "None of the provided genes to perturb are in token dictionary."
240
  )
241
  raise
242
+ elif len(missing_genes) > 0:
243
  logger.warning(
244
+ f"Genes to perturb {missing_genes} are not in token dictionary."
245
+ )
246
+ self.tokens_to_perturb = [
247
+ self.gene_token_dict.get(gene) for gene in self.genes_to_perturb
248
+ ]
249
 
250
  def validate_options(self):
251
  # first disallow options under development
252
  if self.perturb_type in ["inhibit", "activate"]:
253
  logger.error(
254
+ "In silico inhibition and activation currently under development. "
255
  "Current valid options for 'perturb_type': 'delete' or 'overexpress'"
256
  )
257
  raise
258
+ if (self.combos > 0) and (self.anchor_token is None):
259
+ logger.error(
260
+ "Combination perturbation without anchor gene is currently under development. "
261
+ "Currently, must provide anchor gene for combination perturbation."
262
+ )
263
+ raise
264
+
265
  # confirm arguments are within valid options and compatible with each other
266
+ for attr_name, valid_options in self.valid_option_dict.items():
267
  attr_value = self.__dict__[attr_name]
268
  if type(attr_value) not in {list, dict}:
269
  if attr_value in valid_options:
273
  continue
274
  valid_type = False
275
  for option in valid_options:
276
+ if (option in [bool, int, list, dict]) and isinstance(
277
+ attr_value, option
278
+ ):
279
  valid_type = True
280
  break
281
  if valid_type:
282
  continue
283
  logger.error(
284
+ f"Invalid option for {attr_name}. "
285
  f"Valid options for {attr_name}: {valid_options}"
286
  )
287
  raise
288
+
289
+ if self.perturb_type in ["delete", "overexpress"]:
290
  if self.perturb_rank_shift is not None:
291
  if self.perturb_type == "delete":
292
  logger.warning(
293
+ "perturb_rank_shift set to None. "
294
+ "If perturb type is delete then gene is deleted entirely "
295
+ "rather than shifted by quartile"
296
+ )
297
  elif self.perturb_type == "overexpress":
298
  logger.warning(
299
+ "perturb_rank_shift set to None. "
300
+ "If perturb type is overexpress then gene is moved to front "
301
+ "of rank value encoding rather than shifted by quartile"
302
+ )
303
  self.perturb_rank_shift = None
304
+
305
  if (self.anchor_gene is not None) and (self.emb_mode == "cell_and_gene"):
306
  self.emb_mode = "cell"
307
  logger.warning(
308
+ "emb_mode set to 'cell'. "
309
+ "Currently, analysis with anchor gene "
310
+ "only outputs effect on cell embeddings."
311
+ )
312
+
313
  if self.cell_states_to_model is not None:
314
+ pu.validate_cell_states_to_model(self.cell_states_to_model)
315
+
316
+ if self.anchor_gene is not None:
317
+ self.anchor_gene = None
318
  logger.warning(
319
+ "anchor_gene set to None. "
320
+ "Currently, anchor gene not available "
321
+ "when modeling multiple cell states."
322
  )
323
+
324
+ if self.state_embs_dict is None:
325
+ logger.error(
326
+ "state_embs_dict must be provided for mode with cell_states_to_model. "
327
+ "Format is dictionary with keys specifying each possible cell state to model. "
328
+ "Values are target embedding positions as torch.tensor."
329
+ )
330
+ raise
331
+
332
+ for state_emb in self.state_embs_dict.values():
333
+ if not torch.is_tensor(state_emb):
334
  logger.error(
335
+ "state_embs_dict must be dictionary with values being torch.tensor."
336
+ )
337
  raise
338
 
339
+ keys_absent = []
340
+ for k, v in self.cell_states_to_model.items():
341
+ if (k == "start_state") or (k == "goal_state"):
342
+ if v not in self.state_embs_dict.keys():
343
+ keys_absent.append(v)
344
+ if k == "alt_states":
345
+ for state in v:
346
+ if state not in self.state_embs_dict.keys():
347
+ keys_absent.append(state)
348
+ if len(keys_absent) > 0:
349
  logger.error(
350
+ "Each start_state, goal_state, and alt_states in cell_states_to_model "
351
+ "must be a key in state_embs_dict with the value being "
352
+ "the state's embedding position as torch.tensor. "
353
+ f"Missing keys: {keys_absent}"
354
  )
355
  raise
356
 
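# What the two checks above require, sketched with placeholder tensors
# (a real state_embs_dict would come from extracted embeddings):
import torch

cell_states_to_model = {"state_key": "disease", "start_state": "dcm",
                        "goal_state": "nf", "alt_states": ["hcm"]}
state_embs_dict = {s: torch.zeros(1, 256) for s in ("dcm", "nf", "hcm")}

# every start/goal/alt state must appear as a key; dropping "hcm" would be caught
needed = {cell_states_to_model["start_state"],
          cell_states_to_model["goal_state"],
          *cell_states_to_model["alt_states"]}
print(needed - set(state_embs_dict))  # set(): nothing missing, so no error raised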
357
+ if self.perturb_type in ["inhibit", "activate"]:
358
  if self.perturb_rank_shift is None:
359
  logger.error(
360
+ "If perturb_type is inhibit or activate then "
361
+ "quartile to shift by must be specified."
362
+ )
363
  raise
364
+
365
  if self.filter_data is not None:
366
+ for key, value in self.filter_data.items():
367
+ if not isinstance(value, list):
368
  self.filter_data[key] = [value]
369
  logger.warning(
370
+ "Values in filter_data dict must be lists. "
371
+ f"Changing {key} value to list ([{value}])."
372
+ )
373
+
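# The list coercion above in isolation (illustrative filter):
filter_data = {"cell_type": "cardiomyocyte"}  # scalar given by mistake
filter_data = {k: v if isinstance(v, list) else [v] for k, v in filter_data.items()}
print(filter_data)  # {'cell_type': ['cardiomyocyte']}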
374
  if self.cell_inds_to_perturb != "all":
375
  if set(self.cell_inds_to_perturb.keys()) != {"start", "end"}:
376
  logger.error(
377
  "If cell_inds_to_perturb is a dictionary, keys must be 'start' and 'end'."
378
  )
379
  raise
380
+ if (
381
+ self.cell_inds_to_perturb["start"] < 0
382
+ or self.cell_inds_to_perturb["end"] < 0
383
+ ):
384
+ logger.error("cell_inds_to_perturb must be positive.")
385
  raise
386
 
387
+ def perturb_data(
388
+ self, model_directory, input_data_file, output_directory, output_prefix
389
+ ):
390
  """
391
  Perturb genes in input data and save as results in output_directory.
392
 
402
  Prefix for output files
403
  """
404
 
405
+ ### format output path ###
406
+ output_path_prefix = os.path.join(
407
+ output_directory, f"in_silico_{self.perturb_type}_{output_prefix}"
408
+ )
409
+
410
+ ### load model and define parameters ###
411
+ model = pu.load_model(self.model_type, self.num_classes, model_directory)
412
+ self.max_len = pu.get_model_input_size(model)
413
+ layer_to_quant = pu.quant_layers(model) + self.emb_layer
414
+
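# emb_layer is an offset added to the hidden-layer count; for an illustrative
# 6-layer model (pu.quant_layers supplies the real count):
n_layers = 6
print(n_layers + 0)   # emb_layer=0  -> last layer
print(n_layers + -1)  # emb_layer=-1 -> second-to-last layer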
415
+ ### filter input data ###
416
+ # general filtering of input data based on filter_data argument
417
+ filtered_input_data = pu.load_and_filter(
418
+ self.filter_data, self.nproc, input_data_file
419
+ )
420
+ filtered_input_data = self.apply_additional_filters(filtered_input_data)
421
+
422
+ if self.perturb_group is True:
423
+ self.isp_perturb_set(
424
+ model, filtered_input_data, layer_to_quant, output_path_prefix
425
+ )
426
  else:
427
+ self.isp_perturb_all(
428
+ model, filtered_input_data, layer_to_quant, output_path_prefix
429
+ )
430
+
431
+ def apply_additional_filters(self, filtered_input_data):
432
+ # additional filtering of input data dependent on isp mode
433
+ if self.cell_states_to_model is not None:
434
+ # filter for cells with start_state and log result
435
+ filtered_input_data = pu.filter_data_by_start_state(
436
+ filtered_input_data, self.cell_states_to_model, self.nproc
437
+ )
438
+
 
439
  if (self.tokens_to_perturb != "all") and (self.perturb_type != "overexpress"):
440
+ # filter for cells with tokens_to_perturb and log result
441
+ filtered_input_data = pu.filter_data_by_tokens_and_log(
442
+ filtered_input_data,
443
+ self.tokens_to_perturb,
444
+ self.nproc,
445
+ "genes_to_perturb",
446
+ )
447
+
448
+ if self.anchor_token is not None:
449
+ # filter for cells with anchor gene and log result
450
+ filtered_input_data = pu.filter_data_by_tokens_and_log(
451
+ filtered_input_data, self.anchor_token, self.nproc, "anchor_gene"
452
+ )
453
+
454
+ # downsample and sort largest to smallest to encounter memory constraints earlier
455
+ filtered_input_data = pu.downsample_and_sort(
456
+ filtered_input_data, self.max_ncells
457
+ )
458
+
459
+ # slice dataset if cell_inds_to_perturb is not "all"
460
  if self.cell_inds_to_perturb != "all":
461
+ filtered_input_data = pu.slice_by_inds_to_perturb(
462
+ filtered_input_data, self.cell_inds_to_perturb
463
+ )
464
+
465
+ return filtered_input_data
466
+
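# start/end slice semantics above, with a plain list standing in for the Dataset:
cells = list(range(5000))
cell_inds_to_perturb = {"start": 1000, "end": 2000}
subset = cells[cell_inds_to_perturb["start"]:cell_inds_to_perturb["end"]]
print(len(subset))  # 1000 cells, e.g. one shard of a dataset split across GPUs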
467
+ def isp_perturb_set(
468
+ self,
469
+ model,
470
+ filtered_input_data: Dataset,
471
+ layer_to_quant: int,
472
+ output_path_prefix: str,
473
+ ):
474
+ def make_group_perturbation_batch(example):
475
+ example_input_ids = example["input_ids"]
476
+ example["tokens_to_perturb"] = self.tokens_to_perturb
477
+ indices_to_perturb = [
478
+ example_input_ids.index(token) if token in example_input_ids else None
479
+ for token in self.tokens_to_perturb
480
+ ]
481
+ indices_to_perturb = [
482
+ item for item in indices_to_perturb if item is not None
483
+ ]
484
+ if len(indices_to_perturb) > 0:
485
+ example["perturb_index"] = indices_to_perturb
486
+ else:
487
+ # -100 indicates the tokens to perturb are not present in the rank value encoding
488
+ example["perturb_index"] = [-100]
489
+ if self.perturb_type == "delete":
490
+ example = pu.delete_indices(example)
491
+ elif self.perturb_type == "overexpress":
492
+ example = pu.overexpress_tokens(example, self.max_len)
493
+ example["n_overflow"] = pu.calc_n_overflow(
494
+ self.max_len,
495
+ example["length"],
496
+ self.tokens_to_perturb,
497
+ indices_to_perturb,
498
+ )
499
+ return example
500
+
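# The perturb_index lookup above, on a toy rank value encoding (illustrative ids):
input_ids = [10, 42, 7, 99]
tokens_to_perturb = [42, 5]  # token 5 is absent from this cell

indices = [input_ids.index(t) if t in input_ids else None for t in tokens_to_perturb]
indices = [i for i in indices if i is not None]
print(indices)  # [1]; if empty, the sentinel [-100] is stored instead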
501
+ total_batch_length = len(filtered_input_data)
502
+ if self.cell_states_to_model is None:
503
+ cos_sims_dict = defaultdict(list)
504
+ else:
505
+ cos_sims_dict = {
506
+ state: defaultdict(list)
507
+ for state in pu.get_possible_states(self.cell_states_to_model)
508
+ }
509
+
510
+ perturbed_data = filtered_input_data.map(
511
+ make_group_perturbation_batch, num_proc=self.nproc
512
+ )
513
+ if self.perturb_type == "overexpress":
514
+ filtered_input_data = filtered_input_data.add_column(
515
+ "n_overflow", perturbed_data["n_overflow"]
516
+ )
517
+ # remove overflow genes from original data so that embeddings are comparable
518
+ # i.e. if original cell has genes 0:2047 and you want to overexpress new gene 2048,
519
+ # then the perturbed cell will be 2048+0:2046 so we compare it to an original cell 0:2046.
520
+ # (otherwise we will be modeling the effect of both deleting 2047 and adding 2048,
521
+ # rather than only adding 2048)
522
+ filtered_input_data = filtered_input_data.map(
523
+ pu.truncate_by_n_overflow, num_proc=self.nproc
524
+ )
525
+
526
+ if self.emb_mode == "cell_and_gene":
527
+ stored_gene_embs_dict = defaultdict(list)
528
+
529
+ # iterate through batches
530
+ for i in trange(0, total_batch_length, self.forward_batch_size):
531
+ max_range = min(i + self.forward_batch_size, total_batch_length)
532
+ inds_select = [i for i in range(i, max_range)]
533
+
534
+ minibatch = filtered_input_data.select(inds_select)
535
+ perturbation_batch = perturbed_data.select(inds_select)
536
+
537
+ if self.cell_emb_style == "mean_pool":
538
+ full_original_emb = get_embs(
539
+ model,
540
+ minibatch,
541
+ "gene",
542
+ layer_to_quant,
543
+ self.pad_token_id,
544
+ self.forward_batch_size,
545
+ summary_stat=None,
546
+ silent=True,
547
+ )
548
+ indices_to_perturb = perturbation_batch["perturb_index"]
549
+ # remove indices that were perturbed
550
+ original_emb = pu.remove_perturbed_indices_set(
551
+ full_original_emb,
552
+ self.perturb_type,
553
+ indices_to_perturb,
554
+ self.tokens_to_perturb,
555
+ minibatch["length"],
556
+ )
557
+ full_perturbation_emb = get_embs(
558
+ model,
559
+ perturbation_batch,
560
+ "gene",
561
+ layer_to_quant,
562
+ self.pad_token_id,
563
+ self.forward_batch_size,
564
+ summary_stat=None,
565
+ silent=True,
566
+ )
567
+
568
+ # remove overexpressed genes
569
+ if self.perturb_type == "overexpress":
570
+ perturbation_emb = full_perturbation_emb[
571
+ :, len(self.tokens_to_perturb) :, :
572
+ ]
573
+
574
+ elif self.perturb_type == "delete":
575
+ perturbation_emb = full_perturbation_emb[
576
+ :, : max(perturbation_batch["length"]), :
577
+ ]
578
+
579
+ n_perturbation_genes = perturbation_emb.size()[1]
580
+
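# Dropping the prepended (overexpressed) genes before comparison, in miniature:
import torch

full_perturbation_emb = torch.randn(2, 5, 8)  # (cells, genes, hidden); 1 gene prepended
perturbation_emb = full_perturbation_emb[:, 1:, :]
print(perturbation_emb.shape)  # torch.Size([2, 4, 8])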
581
+ # if no goal states, the cell cosine similarities are the mean of the gene cosine similarities
582
+ if (
583
+ self.cell_states_to_model is None
584
+ or self.emb_mode == "cell_and_gene"
585
+ ):
586
+ gene_cos_sims = pu.quant_cos_sims(
587
+ perturbation_emb,
588
+ original_emb,
589
+ self.cell_states_to_model,
590
+ self.state_embs_dict,
591
+ emb_mode="gene",
592
+ )
593
+
594
+ # if there are goal states, the cosine similarities are the cell cosine similarities
595
+ if self.cell_states_to_model is not None:
596
+ original_cell_emb = pu.mean_nonpadding_embs(
597
+ full_original_emb,
598
+ torch.tensor(minibatch["length"], device="cuda"),
599
+ dim=1,
600
+ )
601
+ perturbation_cell_emb = pu.mean_nonpadding_embs(
602
+ full_perturbation_emb,
603
+ torch.tensor(perturbation_batch["length"], device="cuda"),
604
+ dim=1,
605
+ )
606
+ cell_cos_sims = pu.quant_cos_sims(
607
+ perturbation_cell_emb,
608
+ original_cell_emb,
609
+ self.cell_states_to_model,
610
+ self.state_embs_dict,
611
+ emb_mode="cell",
612
+ )
613
+
614
+ # get cosine similarities in gene embeddings
615
+ # if getting gene embeddings, need gene names
616
+ if self.emb_mode == "cell_and_gene":
617
+ gene_list = minibatch["input_ids"]
618
+ # need to truncate gene_list
619
+ gene_list = [
620
+ [g for g in genes if g not in self.tokens_to_perturb][
621
+ :n_perturbation_genes
622
+ ]
623
+ for genes in gene_list
624
+ ]
625
+
626
+ for cell_i, genes in enumerate(gene_list):
627
+ for gene_j, affected_gene in enumerate(genes):
628
+ if len(self.genes_to_perturb) > 1:
629
+ tokens_to_perturb = tuple(self.tokens_to_perturb)
630
+ else:
631
+ tokens_to_perturb = self.tokens_to_perturb
632
+
633
+ # fill in the gene cosine similarities
634
+ try:
635
+ stored_gene_embs_dict[
636
+ (tokens_to_perturb, affected_gene)
637
+ ].append(gene_cos_sims[cell_i, gene_j].item())
638
+ except KeyError:
639
+ stored_gene_embs_dict[
640
+ (tokens_to_perturb, affected_gene)
641
+ ] = gene_cos_sims[cell_i, gene_j].item()
642
  else:
643
+ gene_list = None
644
+
645
+ if self.cell_states_to_model is None:
646
+ # calculate the mean of the gene cosine similarities for cell shift
647
+ # tensor of nonpadding lengths for each cell
648
+ if self.perturb_type == "overexpress":
649
+ # subtract number of genes that were overexpressed
650
+ # since they are removed before getting cos sims
651
+ n_overexpressed = len(self.tokens_to_perturb)
652
+ nonpadding_lens = [
653
+ x - n_overexpressed for x in perturbation_batch["length"]
654
+ ]
655
+ else:
656
+ nonpadding_lens = perturbation_batch["length"]
657
+ cos_sims_data = pu.mean_nonpadding_embs(
658
+ gene_cos_sims, torch.tensor(nonpadding_lens, device="cuda")
659
+ )
660
+ cos_sims_dict = self.update_perturbation_dictionary(
661
+ cos_sims_dict,
662
+ cos_sims_data,
663
+ filtered_input_data,
664
+ indices_to_perturb,
665
+ gene_list,
666
+ )
667
+ else:
668
+ cos_sims_data = cell_cos_sims
669
+ for state in cos_sims_dict.keys():
670
+ cos_sims_dict[state] = self.update_perturbation_dictionary(
671
+ cos_sims_dict[state],
672
+ cos_sims_data[state],
673
+ filtered_input_data,
674
+ indices_to_perturb,
675
+ gene_list,
676
+ )
677
+ del minibatch
678
+ del perturbation_batch
679
+ del original_emb
680
+ del perturbation_emb
681
+ del cos_sims_data
682
+
683
+ torch.cuda.empty_cache()
684
+
685
+ pu.write_perturbation_dictionary(
686
+ cos_sims_dict,
687
+ f"{output_path_prefix}_cell_embs_dict_{self.tokens_to_perturb}",
688
+ )
689
+
690
+ if self.emb_mode == "cell_and_gene":
691
+ pu.write_perturbation_dictionary(
692
+ stored_gene_embs_dict,
693
+ f"{output_path_prefix}_gene_embs_dict_{self.tokens_to_perturb}",
694
+ )
695
+
696
+ def isp_perturb_all(
697
+ self,
698
+ model,
699
+ filtered_input_data: Dataset,
700
+ layer_to_quant: int,
701
+ output_path_prefix: str,
702
+ ):
703
+ pickle_batch = -1
704
+ if self.cell_states_to_model is None:
705
+ cos_sims_dict = defaultdict(list)
706
+ else:
707
+ cos_sims_dict = {
708
+ state: defaultdict(list)
709
+ for state in pu.get_possible_states(self.cell_states_to_model)
710
+ }
711
+
712
+ if self.emb_mode == "cell_and_gene":
713
+ stored_gene_embs_dict = defaultdict(list)
714
+ for i in trange(len(filtered_input_data)):
715
+ example_cell = filtered_input_data.select([i])
716
+ full_original_emb = get_embs(
717
+ model,
718
+ example_cell,
719
+ "gene",
720
+ layer_to_quant,
721
+ self.pad_token_id,
722
+ self.forward_batch_size,
723
+ summary_stat=None,
724
+ silent=True,
725
+ )
726
+
727
+ # gene_list is used to assign cos sims back to genes
728
+ # need to remove the anchor gene
729
+ gene_list = example_cell["input_ids"][0][:]
730
+ if self.anchor_token is not None:
731
+ for token in self.anchor_token:
732
+ gene_list.remove(token)
733
+
734
+ perturbation_batch, indices_to_perturb = pu.make_perturbation_batch(
735
+ example_cell,
736
+ self.perturb_type,
737
+ self.tokens_to_perturb,
738
+ self.anchor_token,
739
+ self.combos,
740
+ self.nproc,
741
+ )
742
+
743
+ full_perturbation_emb = get_embs(
744
+ model,
745
+ perturbation_batch,
746
+ "gene",
747
+ layer_to_quant,
748
+ self.pad_token_id,
749
+ self.forward_batch_size,
750
+ summary_stat=None,
751
+ silent=True,
752
+ )
753
+
754
+ num_inds_perturbed = 1 + self.combos
755
+ # need to remove overexpressed gene to quantify cosine shifts
756
+ if self.perturb_type == "overexpress":
757
+ perturbation_emb = full_perturbation_emb[:, num_inds_perturbed:, :]
758
+ gene_list = gene_list[
759
+ num_inds_perturbed:
760
+ ] # index 0 is not overexpressed
761
+
762
+ elif self.perturb_type == "delete":
763
+ perturbation_emb = full_perturbation_emb
764
+
765
+ original_batch = pu.make_comparison_batch(
766
+ full_original_emb, indices_to_perturb, perturb_group=False
767
+ )
768
+
769
+ if self.cell_states_to_model is None or self.emb_mode == "cell_and_gene":
770
+ gene_cos_sims = pu.quant_cos_sims(
771
+ perturbation_emb,
772
+ original_batch,
773
+ self.cell_states_to_model,
774
+ self.state_embs_dict,
775
+ emb_mode="gene",
776
+ )
777
+ if self.cell_states_to_model is not None:
778
+ original_cell_emb = pu.compute_nonpadded_cell_embedding(
779
+ full_original_emb, "mean_pool"
780
+ )
781
+ perturbation_cell_emb = pu.compute_nonpadded_cell_embedding(
782
+ full_perturbation_emb, "mean_pool"
783
+ )
784
+
785
+ cell_cos_sims = pu.quant_cos_sims(
786
+ perturbation_cell_emb,
787
+ original_cell_emb,
788
+ self.cell_states_to_model,
789
+ self.state_embs_dict,
790
+ emb_mode="cell",
791
+ )
792
+
793
+ if self.emb_mode == "cell_and_gene":
794
+ # remove perturbed index for gene list
795
+ perturbed_gene_dict = {
796
+ gene: gene_list[:i] + gene_list[i + 1 :]
797
+ for i, gene in enumerate(gene_list)
798
+ }
799
+
800
+ for perturbation_i, perturbed_gene in enumerate(gene_list):
801
+ for gene_j, affected_gene in enumerate(
802
+ perturbed_gene_dict[perturbed_gene]
803
+ ):
804
+ try:
805
+ stored_gene_embs_dict[
806
+ (perturbed_gene, affected_gene)
807
+ ].append(gene_cos_sims[perturbation_i, gene_j].item())
808
+ except KeyError:
809
+ stored_gene_embs_dict[
810
+ (perturbed_gene, affected_gene)
811
+ ] = gene_cos_sims[perturbation_i, gene_j].item()
812
+
813
  if self.cell_states_to_model is None:
814
+ cos_sims_data = torch.mean(gene_cos_sims, dim=1)
815
+ cos_sims_dict = self.update_perturbation_dictionary(
816
+ cos_sims_dict,
817
+ cos_sims_data,
818
+ filtered_input_data,
819
+ indices_to_perturb,
820
+ gene_list,
821
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
822
  else:
823
+ cos_sims_data = cell_cos_sims
824
+ for state in cos_sims_dict.keys():
825
+ cos_sims_dict[state] = self.update_perturbation_dictionary(
826
+ cos_sims_dict[state],
827
+ cos_sims_data[state],
828
+ filtered_input_data,
829
+ indices_to_perturb,
830
+ gene_list,
831
+ )
832
+
833
+ # save dict to disk every 100 cells
834
+ if i % 100 == 0:
835
+ pu.write_perturbation_dictionary(
836
+ cos_sims_dict,
837
+ f"{output_path_prefix}_dict_cell_embs_1Kbatch{pickle_batch}",
838
+ )
839
+ if self.emb_mode == "cell_and_gene":
840
+ pu.write_perturbation_dictionary(
841
+ stored_gene_embs_dict,
842
+ f"{output_path_prefix}_dict_gene_embs_1Kbatch{pickle_batch}",
843
+ )
844
+
845
+ # reset and clear memory every 1000 cells
846
+ if i % 1000 == 0:
847
+ pickle_batch += 1
848
+ if self.cell_states_to_model is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
849
  cos_sims_dict = defaultdict(list)
850
+ else:
851
+ cos_sims_dict = {
852
+ state: defaultdict(list)
853
+ for state in pu.get_possible_states(self.cell_states_to_model)
854
+ }
855
+
856
+ if self.emb_mode == "cell_and_gene":
857
+ stored_gene_embs_dict = defaultdict(list)
858
+
859
+ torch.cuda.empty_cache()
860
+
861
+ pu.write_perturbation_dictionary(
862
+ cos_sims_dict, f"{output_path_prefix}_dict_cell_embs_1Kbatch{pickle_batch}"
863
+ )
864
+
865
+ if self.emb_mode == "cell_and_gene":
866
+ pu.write_perturbation_dictionary(
867
+ stored_gene_embs_dict,
868
+ f"{output_path_prefix}_dict_gene_embs_1Kbatch{pickle_batch}",
869
+ )
870
+
871
+ def update_perturbation_dictionary(
872
+ self,
873
+ cos_sims_dict: defaultdict,
874
+ cos_sims_data: torch.Tensor,
875
+ filtered_input_data: Dataset,
876
+ indices_to_perturb: list[list[int]],
877
+ gene_list=None,
878
+ ):
879
+ if gene_list is not None and cos_sims_data.shape[0] != len(gene_list):
880
+ logger.error(
881
+ f"len(cos_sims_data.shape[0]) != len(gene_list). \n \
882
+ cos_sims_data.shape[0] = {cos_sims_data.shape[0]}.\n \
883
+ len(gene_list) = {len(gene_list)}."
+            )
+            raise
+
+        if self.perturb_group is True:
+            if len(self.tokens_to_perturb) > 1:
+                perturbed_genes = tuple(self.tokens_to_perturb)
+            else:
+                perturbed_genes = self.tokens_to_perturb[0]
+
+            # if cell embeddings, can just append
+            # shape will be (batch size, 1)
+            cos_sims_data = torch.squeeze(cos_sims_data).tolist()
+
+            # handle case of single cell left
+            if not isinstance(cos_sims_data, list):
+                cos_sims_data = [cos_sims_data]
+
+            cos_sims_dict[(perturbed_genes, "cell_emb")] += cos_sims_data
+
+        else:
+            for i, cos in enumerate(cos_sims_data.tolist()):
+                cos_sims_dict[(gene_list[i], "cell_emb")].append(cos)
 
+        return cos_sims_dict
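
For orientation before the stats module changes below: update_perturbation_dictionary keys every result on the perturbed gene token (or tuple of tokens) paired with the literal string "cell_emb", and pu.write_perturbation_dictionary pickles one such dictionary per batch. A minimal sketch of consuming one of these pickles, assuming only the key layout shown above (the file path is a placeholder, not a name this commit guarantees):

import pickle

# placeholder path; the actual files are written by pu.write_perturbation_dictionary
# using the prefixes visible in the diff above
with open("path/to/output_prefix_cell_embs_dict.pickle", "rb") as fp:
    cos_sims_dict = pickle.load(fp)

# keys: (perturbed_gene_token_or_tuple, "cell_emb"); values: one cosine shift per cell
for (perturbed_genes, _), shifts in cos_sims_dict.items():
    print(perturbed_genes, len(shifts), sum(shifts) / len(shifts))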
 
 
geneformer/in_silico_perturber_stats.py CHANGED
@@ -6,9 +6,9 @@ Usage:
  ispstats = InSilicoPerturberStats(mode="goal_state_shift",
                                    combos=0,
                                    anchor_gene=None,
-                                   cell_states_to_model={"state_key": "disease",
-                                                         "start_state": "dcm",
-                                                         "goal_state": "nf",
                                                          "alt_states": ["hcm", "other1", "other2"]})
  ispstats.get_stats("path/to/input_data",
                     None,
@@ -17,88 +17,157 @@ Usage:
  """
 
 
- import os
  import logging
- import numpy as np
- import pandas as pd
  import pickle
  import random
- import statsmodels.stats.multitest as smt
  from pathlib import Path
  from scipy.stats import ranksums
  from sklearn.mixture import GaussianMixture
- from tqdm.auto import trange, tqdm
-
- from .in_silico_perturber import flatten_list
 
  from .tokenizer import TOKEN_DICTIONARY_FILE
 
  GENE_NAME_ID_DICTIONARY_FILE = Path(__file__).parent / "gene_name_id_dict.pkl"
 
  logger = logging.getLogger(__name__)
 
  # invert dictionary keys/values
  def invert_dict(dictionary):
      return {v: k for k, v in dictionary.items()}
 
  # read raw dictionary files
- def read_dictionaries(input_data_directory, cell_or_gene_emb, anchor_token):
-     file_found = 0
      file_path_list = []
-     dict_list = []
      for file in os.listdir(input_data_directory):
-         # process only _raw.pickle files
-         if file.endswith("_raw.pickle"):
-             file_found = 1
              file_path_list += [f"{input_data_directory}/{file}"]
      for file_path in tqdm(file_path_list):
          with open(file_path, "rb") as fp:
              cos_sims_dict = pickle.load(fp)
-             if cell_or_gene_emb == "cell":
-                 cell_emb_dict = {k: v for k,
-                                  v in cos_sims_dict.items() if v and "cell_emb" in k}
-                 dict_list += [cell_emb_dict]
-             elif cell_or_gene_emb == "gene":
-                 gene_emb_dict = {k: v for k,
-                                  v in cos_sims_dict.items() if v and anchor_token == k[0]}
-                 dict_list += [gene_emb_dict]
-     if file_found == 0:
          logger.error(
-             "No raw data for processing found within provided directory. " \
-             "Please ensure data files end with '_raw.pickle'.")
          raise
-     return dict_list
 
  # get complete gene list
- def get_gene_list(dict_list,mode):
      if mode == "cell":
          position = 0
      elif mode == "gene":
          position = 1
      gene_set = set()
-     for dict_i in dict_list:
-         gene_set.update([k[position] for k, v in dict_i.items() if v])
      gene_list = list(gene_set)
      if mode == "gene":
          gene_list.remove("cell_emb")
      gene_list.sort()
      return gene_list
 
  def token_tuple_to_ensembl_ids(token_tuple, gene_token_id_dict):
-     return tuple([gene_token_id_dict.get(i, np.nan) for i in token_tuple])
 
  def n_detections(token, dict_list, mode, anchor_token):
      cos_sim_megalist = []
      for dict_i in dict_list:
          if mode == "cell":
-             cos_sim_megalist += dict_i.get((token, "cell_emb"),[])
          elif mode == "gene":
-             cos_sim_megalist += dict_i.get((anchor_token, token),[])
      return len(cos_sim_megalist)
 
  def get_fdr(pvalues):
      return list(smt.multipletests(pvalues, alpha=0.05, method="fdr_bh")[1])
 
  def get_impact_component(test_value, gaussian_mixture_model):
      impact_border = gaussian_mixture_model.means_[0][0]
      nonimpact_border = gaussian_mixture_model.means_[1][0]
@@ -114,236 +183,356 @@ def get_impact_component(test_value, gaussian_mixture_model):
          impact_component = 1
      return impact_component
 
  # aggregate data for single perturbation in multiple cells
- def isp_aggregate_grouped_perturb(cos_sims_df, dict_list):
-     names=["Cosine_shift"]
      cos_sims_full_df = pd.DataFrame(columns=names)
 
      cos_shift_data = []
      token = cos_sims_df["Gene"][0]
      for dict_i in dict_list:
-         cos_shift_data += dict_i.get((token, "cell_emb"),[])
      cos_sims_full_df["Cosine_shift"] = cos_shift_data
-     return cos_sims_full_df
 
  # stats comparing cos sim shifts towards goal state of test perturbations vs random perturbations
- def isp_stats_to_goal_state(cos_sims_df, dict_list, cell_states_to_model, genes_perturbed):
-     cell_state_key = cell_states_to_model["start_state"]
-     if ("alt_states" not in cell_states_to_model.keys()) \
-         or (len(cell_states_to_model["alt_states"]) == 0) \
-         or (cell_states_to_model["alt_states"] == [None]):
          alt_end_state_exists = False
-     elif (len(cell_states_to_model["alt_states"]) > 0) and (cell_states_to_model["alt_states"] != [None]):
          alt_end_state_exists = True
-
      # for single perturbation in multiple cells, there are no random perturbations to compare to
      if genes_perturbed != "all":
-         names=["Shift_to_goal_end",
-                "Shift_to_alt_end"]
-         if alt_end_state_exists == False:
-             names.remove("Shift_to_alt_end")
-         cos_sims_full_df = pd.DataFrame(columns=names)
-
-         cos_shift_data = []
          token = cos_sims_df["Gene"][0]
-         for dict_i in dict_list:
-             cos_shift_data += dict_i.get((token, "cell_emb"),[])
-         if alt_end_state_exists == False:
-             cos_sims_full_df["Shift_to_goal_end"] = [goal_end for start_state,goal_end in cos_shift_data]
-         if alt_end_state_exists == True:
-             cos_sims_full_df["Shift_to_goal_end"] = [goal_end for start_state,goal_end,alt_end in cos_shift_data]
-             cos_sims_full_df["Shift_to_alt_end"] = [alt_end for start_state,goal_end,alt_end in cos_shift_data]
-
          # sort by shift to desired state
-         cos_sims_full_df = cos_sims_full_df.sort_values(by=["Shift_to_goal_end"],
-                                                         ascending=[False])
-         return cos_sims_full_df
-
      elif genes_perturbed == "all":
-         random_tuples = []
          for i in trange(cos_sims_df.shape[0]):
              token = cos_sims_df["Gene"][i]
-             for dict_i in dict_list:
-                 random_tuples += dict_i.get((token, "cell_emb"),[])
-
-         if alt_end_state_exists == False:
-             goal_end_random_megalist = [goal_end for start_state,goal_end in random_tuples]
-         elif alt_end_state_exists == True:
-             goal_end_random_megalist = [goal_end for start_state,goal_end,alt_end in random_tuples]
-             alt_end_random_megalist = [alt_end for start_state,goal_end,alt_end in random_tuples]
 
          # downsample to improve speed of ranksums
          if len(goal_end_random_megalist) > 100_000:
              random.seed(42)
-             goal_end_random_megalist = random.sample(goal_end_random_megalist, k=100_000)
-         if alt_end_state_exists == True:
-             if len(alt_end_random_megalist) > 100_000:
-                 random.seed(42)
-                 alt_end_random_megalist = random.sample(alt_end_random_megalist, k=100_000)
-
-         names=["Gene",
-                "Gene_name",
-                "Ensembl_ID",
-                "Shift_to_goal_end",
-                "Shift_to_alt_end",
-                "Goal_end_vs_random_pval",
-                "Alt_end_vs_random_pval"]
-         if alt_end_state_exists == False:
-             names.remove("Shift_to_alt_end")
-             names.remove("Alt_end_vs_random_pval")
          cos_sims_full_df = pd.DataFrame(columns=names)
 
          for i in trange(cos_sims_df.shape[0]):
              token = cos_sims_df["Gene"][i]
              name = cos_sims_df["Gene_name"][i]
              ensembl_id = cos_sims_df["Ensembl_ID"][i]
-             cos_shift_data = []
 
-             for dict_i in dict_list:
-                 cos_shift_data += dict_i.get((token, "cell_emb"),[])
 
-             if alt_end_state_exists == False:
-                 goal_end_cos_sim_megalist = [goal_end for start_state,goal_end in cos_shift_data]
-             elif alt_end_state_exists == True:
-                 goal_end_cos_sim_megalist = [goal_end for start_state,goal_end,alt_end in cos_shift_data]
-                 alt_end_cos_sim_megalist = [alt_end for start_state,goal_end,alt_end in cos_shift_data]
-                 mean_alt_end = np.mean(alt_end_cos_sim_megalist)
-                 pval_alt_end = ranksums(alt_end_random_megalist,alt_end_cos_sim_megalist).pvalue
 
-             mean_goal_end = np.mean(goal_end_cos_sim_megalist)
-             pval_goal_end = ranksums(goal_end_random_megalist,goal_end_cos_sim_megalist).pvalue
-
-             if alt_end_state_exists == False:
-                 data_i = [token,
-                           name,
-                           ensembl_id,
-                           mean_goal_end,
-                           pval_goal_end]
-             elif alt_end_state_exists == True:
-                 data_i = [token,
-                           name,
-                           ensembl_id,
-                           mean_goal_end,
-                           mean_alt_end,
-                           pval_goal_end,
-                           pval_alt_end]
-
-             cos_sims_df_i = pd.DataFrame(dict(zip(names,data_i)),index=[i])
-             cos_sims_full_df = pd.concat([cos_sims_full_df,cos_sims_df_i])
-
-         cos_sims_full_df["Goal_end_FDR"] = get_fdr(list(cos_sims_full_df["Goal_end_vs_random_pval"]))
-         if alt_end_state_exists == True:
-             cos_sims_full_df["Alt_end_FDR"] = get_fdr(list(cos_sims_full_df["Alt_end_vs_random_pval"]))
 
          # quantify number of detections of each gene
-         cos_sims_full_df["N_Detections"] = [n_detections(i, dict_list, "cell", None) for i in cos_sims_full_df["Gene"]]
-
-         # sort by shift to desired state
-         cos_sims_full_df["Sig"] = [1 if fdr<0.05 else 0 for fdr in cos_sims_full_df["Goal_end_FDR"]]
-         cos_sims_full_df = cos_sims_full_df.sort_values(by=["Sig",
-                                                             "Shift_to_goal_end",
-                                                             "Goal_end_FDR"],
-                                                         ascending=[False,False,True])
-
      return cos_sims_full_df
 
  # stats comparing cos sim shifts of test perturbations vs null distribution
  def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
      cos_sims_full_df = cos_sims_df.copy()
 
      cos_sims_full_df["Test_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
      cos_sims_full_df["Null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
-     cos_sims_full_df["Test_vs_null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
      cos_sims_full_df["Test_vs_null_pval"] = np.zeros(cos_sims_df.shape[0], dtype=float)
      cos_sims_full_df["Test_vs_null_FDR"] = np.zeros(cos_sims_df.shape[0], dtype=float)
-     cos_sims_full_df["N_Detections_test"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
-     cos_sims_full_df["N_Detections_null"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
-
      for i in trange(cos_sims_df.shape[0]):
          token = cos_sims_df["Gene"][i]
          test_shifts = []
          null_shifts = []
-
          for dict_i in dict_list:
-             test_shifts += dict_i.get((token, "cell_emb"),[])
 
          for dict_i in null_dict_list:
-             null_shifts += dict_i.get((token, "cell_emb"),[])
-
          cos_sims_full_df.loc[i, "Test_avg_shift"] = np.mean(test_shifts)
          cos_sims_full_df.loc[i, "Null_avg_shift"] = np.mean(null_shifts)
-         cos_sims_full_df.loc[i, "Test_vs_null_avg_shift"] = np.mean(test_shifts)-np.mean(null_shifts)
-         cos_sims_full_df.loc[i, "Test_vs_null_pval"] = ranksums(test_shifts,
-             null_shifts, nan_policy="omit").pvalue
-
         cos_sims_full_df.loc[i, "N_Detections_test"] = len(test_shifts)
          cos_sims_full_df.loc[i, "N_Detections_null"] = len(null_shifts)
 
-     cos_sims_full_df["Test_vs_null_FDR"] = get_fdr(cos_sims_full_df["Test_vs_null_pval"])
-
-     cos_sims_full_df["Sig"] = [1 if fdr<0.05 else 0 for fdr in cos_sims_full_df["Test_vs_null_FDR"]]
-     cos_sims_full_df = cos_sims_full_df.sort_values(by=["Sig",
-                                                         "Test_vs_null_avg_shift",
-                                                         "Test_vs_null_FDR"],
-                                                     ascending=[False,False,True])
      return cos_sims_full_df
 
  # stats for identifying perturbations with largest effect within a given set of cells
  # fits a mixture model to 2 components (impact vs. non-impact) and
  # reports the most likely component for each test perturbation
  # Note: because assumes given perturbation has a consistent effect in the cells tested,
  # we recommend only using the mixture model strategy with uniform cell populations
  def isp_stats_mixture_model(cos_sims_df, dict_list, combos, anchor_token):
-
-     names=["Gene",
-            "Gene_name",
-            "Ensembl_ID"]
-
      if combos == 0:
          names += ["Test_avg_shift"]
      elif combos == 1:
-         names += ["Anchor_shift",
-                   "Test_token_shift",
-                   "Sum_of_indiv_shifts",
-                   "Combo_shift",
-                   "Combo_minus_sum_shift"]
-
-     names += ["Impact_component",
-               "Impact_component_percent"]
 
      cos_sims_full_df = pd.DataFrame(columns=names)
      avg_values = []
      gene_names = []
-
      for i in trange(cos_sims_df.shape[0]):
          token = cos_sims_df["Gene"][i]
          name = cos_sims_df["Gene_name"][i]
          ensembl_id = cos_sims_df["Ensembl_ID"][i]
          cos_shift_data = []
-
         for dict_i in dict_list:
              if (combos == 0) and (anchor_token is not None):
-                 cos_shift_data += dict_i.get((anchor_token, token),[])
              else:
-                 cos_shift_data += dict_i.get((token, "cell_emb"),[])
-
          # Extract values for current gene
          if combos == 0:
              test_values = cos_shift_data
          elif combos == 1:
              test_values = []
              for tup in cos_shift_data:
-                 test_values.append(tup[2])
-
          if len(test_values) > 0:
              avg_value = np.mean(test_values)
              avg_values.append(avg_value)
              gene_names.append(name)
-
      # fit Gaussian mixture model to dataset of mean for each gene
      avg_values_to_fit = np.array(avg_values).reshape(-1, 1)
      gm = GaussianMixture(n_components=2, random_state=0).fit(avg_values_to_fit)
-
      for i in trange(cos_sims_df.shape[0]):
          token = cos_sims_df["Gene"][i]
          name = cos_sims_df["Gene_name"][i]
@@ -352,71 +541,95 @@ def isp_stats_mixture_model(cos_sims_df, dict_list, combos, anchor_token):
 
          for dict_i in dict_list:
              if (combos == 0) and (anchor_token is not None):
-                 cos_shift_data += dict_i.get((anchor_token, token),[])
              else:
-                 cos_shift_data += dict_i.get((token, "cell_emb"),[])
-
          if combos == 0:
              mean_test = np.mean(cos_shift_data)
-             impact_components = [get_impact_component(value,gm) for value in cos_shift_data]
          elif combos == 1:
-             anchor_cos_sim_megalist = [anchor for anchor,token,combo in cos_shift_data]
-             token_cos_sim_megalist = [token for anchor,token,combo in cos_shift_data]
-             anchor_plus_token_cos_sim_megalist = [1-((1-anchor)+(1-token)) for anchor,token,combo in cos_shift_data]
-             combo_anchor_token_cos_sim_megalist = [combo for anchor,token,combo in cos_shift_data]
-             combo_minus_sum_cos_sim_megalist = [combo-(1-((1-anchor)+(1-token))) for anchor,token,combo in cos_shift_data]
 
              mean_anchor = np.mean(anchor_cos_sim_megalist)
              mean_token = np.mean(token_cos_sim_megalist)
              mean_sum = np.mean(anchor_plus_token_cos_sim_megalist)
              mean_test = np.mean(combo_anchor_token_cos_sim_megalist)
              mean_combo_minus_sum = np.mean(combo_minus_sum_cos_sim_megalist)
-
-             impact_components = [get_impact_component(value,gm) for value in combo_anchor_token_cos_sim_megalist]
-
-         impact_component = get_impact_component(mean_test,gm)
-         impact_component_percent = np.mean(impact_components)*100
-
-         data_i = [token,
-                   name,
-                   ensembl_id]
          if combos == 0:
              data_i += [mean_test]
          elif combos == 1:
-             data_i += [mean_anchor,
-                        mean_token,
-                        mean_sum,
-                        mean_test,
-                        mean_combo_minus_sum]
-         data_i += [impact_component,
-                    impact_component_percent]
-
-         cos_sims_df_i = pd.DataFrame(dict(zip(names,data_i)),index=[i])
-         cos_sims_full_df = pd.concat([cos_sims_full_df,cos_sims_df_i])
-
      # quantify number of detections of each gene
-     cos_sims_full_df["N_Detections"] = [n_detections(i,
-                                                      dict_list,
-                                                      "gene",
-                                                      anchor_token) for i in cos_sims_full_df["Gene"]]
-
      if combos == 0:
-         cos_sims_full_df = cos_sims_full_df.sort_values(by=["Impact_component",
-                                                             "Test_avg_shift"],
-                                                         ascending=[False,True])
      elif combos == 1:
-         cos_sims_full_df = cos_sims_full_df.sort_values(by=["Impact_component",
-                                                             "Combo_minus_sum_shift"],
-                                                         ascending=[False,True])
      return cos_sims_full_df
 
  class InSilicoPerturberStats:
      valid_option_dict = {
-         "mode": {"goal_state_shift","vs_null","mixture_model","aggregate_data"},
-         "combos": {0,1},
          "anchor_gene": {None, str},
          "cell_states_to_model": {None, dict},
      }
 
      def __init__(
          self,
          mode="mixture_model",
@@ -424,6 +637,7 @@ class InSilicoPerturberStats:
          combos=0,
          anchor_gene=None,
          cell_states_to_model=None,
          token_dictionary_file=TOKEN_DICTIONARY_FILE,
          gene_name_id_dictionary_file=GENE_NAME_ID_DICTIONARY_FILE,
      ):
@@ -432,12 +646,13 @@ class InSilicoPerturberStats:
 
          Parameters
          ----------
-         mode : {"goal_state_shift","vs_null","mixture_model","aggregate_data"}
              Type of stats.
              "goal_state_shift": perturbation vs. random for desired cell state shift
              "vs_null": perturbation vs. null from provided null distribution dataset
              "mixture_model": perturbation in impact vs. no impact component of mixture model (no goal direction)
              "aggregate_data": aggregates cosine shifts for single perturbation in multiple cells
          genes_perturbed : "all", list
              Genes perturbed in isp experiment.
              Default is assuming genes_to_perturb in isp experiment was "all" (each gene in each cell).
@@ -472,13 +687,14 @@ class InSilicoPerturberStats:
          self.combos = combos
          self.anchor_gene = anchor_gene
          self.cell_states_to_model = cell_states_to_model
-
          self.validate_options()
 
          # load token dictionary (Ensembl IDs:token)
          with open(token_dictionary_file, "rb") as f:
              self.gene_token_dict = pickle.load(f)
-
          # load gene name dictionary (gene name:Ensembl ID)
          with open(gene_name_id_dictionary_file, "rb") as f:
              self.gene_name_id_dict = pickle.load(f)
@@ -489,7 +705,7 @@ class InSilicoPerturberStats:
              self.anchor_token = self.gene_token_dict[self.anchor_gene]
 
      def validate_options(self):
-         for attr_name,valid_options in self.valid_option_dict.items():
              attr_value = self.__dict__[attr_name]
              if type(attr_value) not in {list, dict}:
                  if attr_name in {"anchor_gene"}:
@@ -498,35 +714,40 @@ class InSilicoPerturberStats:
                      continue
              valid_type = False
              for option in valid_options:
-                 if (option in [int,list,dict]) and isinstance(attr_value, option):
                      valid_type = True
                      break
-             if valid_type:
-                 continue
-             logger.error(
-                 f"Invalid option for {attr_name}. " \
-                 f"Valid options for {attr_name}: {valid_options}"
-             )
-             raise
-
          if self.cell_states_to_model is not None:
              if len(self.cell_states_to_model.items()) == 1:
                  logger.warning(
-                     "The single value dictionary for cell_states_to_model will be " \
-                     "replaced with a dictionary with named keys for start, goal, and alternate states. " \
-                     "Please specify state_key, start_state, goal_state, and alt_states " \
-                     "in the cell_states_to_model dictionary for future use. " \
-                     "For example, cell_states_to_model={" \
-                     "'state_key': 'disease', " \
-                     "'start_state': 'dcm', " \
-                     "'goal_state': 'nf', " \
-                     "'alt_states': ['hcm', 'other1', 'other2']}"
                  )
-                 for key,value in self.cell_states_to_model.items():
                      if (len(value) == 3) and isinstance(value, tuple):
-                         if isinstance(value[0],list) and isinstance(value[1],list) and isinstance(value[2],list):
                              if len(value[0]) == 1 and len(value[1]) == 1:
-                                 all_values = value[0]+value[1]+value[2]
                                  if len(all_values) == len(set(all_values)):
                                      continue
                  # reformat to the new named key format
@@ -535,75 +756,93 @@ class InSilicoPerturberStats:
                      "state_key": list(self.cell_states_to_model.keys())[0],
                      "start_state": state_values[0][0],
                      "goal_state": state_values[1][0],
-                     "alt_states": state_values[2:][0]
                  }
-             elif set(self.cell_states_to_model.keys()) == {"state_key", "start_state", "goal_state", "alt_states"}:
-                 if (self.cell_states_to_model["state_key"] is None) \
-                     or (self.cell_states_to_model["start_state"] is None) \
-                     or (self.cell_states_to_model["goal_state"] is None):
                      logger.error(
-                         "Please specify 'state_key', 'start_state', and 'goal_state' in cell_states_to_model.")
                      raise
-
-                 if self.cell_states_to_model["start_state"] == self.cell_states_to_model["goal_state"]:
-                     logger.error(
-                         "All states must be unique.")
                      raise
 
                  if self.cell_states_to_model["alt_states"] is not None:
-                     if type(self.cell_states_to_model["alt_states"]) is not list:
                          logger.error(
                              "self.cell_states_to_model['alt_states'] must be a list (even if it is one element)."
                          )
                          raise
-                     if len(self.cell_states_to_model["alt_states"])!= len(set(self.cell_states_to_model["alt_states"])):
-                         logger.error(
-                             "All states must be unique.")
                          raise
 
              else:
                  logger.error(
-                     "cell_states_to_model must only have the following four keys: " \
-                     "'state_key', 'start_state', 'goal_state', 'alt_states'." \
-                     "For example, cell_states_to_model={" \
-                     "'state_key': 'disease', " \
-                     "'start_state': 'dcm', " \
-                     "'goal_state': 'nf', " \
-                     "'alt_states': ['hcm', 'other1', 'other2']}"
                  )
                  raise
 
              if self.anchor_gene is not None:
                  self.anchor_gene = None
                  logger.warning(
-                     "anchor_gene set to None. " \
-                     "Currently, anchor gene not available " \
-                     "when modeling multiple cell states.")
-
          if self.combos > 0:
              if self.anchor_gene is None:
                  logger.error(
-                     "Currently, stats are only supported for combination " \
-                     "in silico perturbation run with anchor gene. Please add " \
-                     "anchor gene when using with combos > 0. ")
                  raise
-
         if (self.mode == "mixture_model") and (self.genes_perturbed != "all"):
              logger.error(
-                 "Mixture model mode requires multiple gene perturbations to fit model " \
-                 "so is incompatible with a single grouped perturbation.")
              raise
          if (self.mode == "aggregate_data") and (self.genes_perturbed == "all"):
              logger.error(
-                 "Simple data aggregation mode is for single perturbation in multiple cells " \
-                 "so is incompatible with a genes_perturbed being 'all'.")
-             raise
-
-     def get_stats(self,
-                   input_data_directory,
-                   null_dist_data_directory,
-                   output_directory,
-                   output_prefix):
          """
          Get stats for in silico perturbation data and save as results in output_directory.
 
@@ -617,20 +856,22 @@ class InSilicoPerturberStats:
              Path to directory where perturbation data will be saved as .csv
          output_prefix : str
              Prefix for output .csv
-
          Outputs
          ----------
          Definition of possible columns in .csv output file.
-
          Of note, not all columns will be present in all output files.
          Some columns are specific to particular perturbation modes.
-
          "Gene": gene token
          "Gene_name": gene name
          "Ensembl_ID": gene Ensembl ID
          "N_Detections": number of cells in which each gene or gene combination was detected in the input dataset
          "Sig": 1 if FDR<0.05, otherwise 0
-
          "Shift_to_goal_end": cosine shift from start state towards goal end state in response to given perturbation
          "Shift_to_alt_end": cosine shift from start state towards alternate end state in response to given perturbation
          "Goal_end_vs_random_pval": pvalue of cosine shift from start state towards goal end state by Wilcoxon
@@ -639,7 +880,7 @@ class InSilicoPerturberStats:
              pvalue compares shift caused by perturbing given gene compared to random genes
          "Goal_end_FDR": Benjamini-Hochberg correction of "Goal_end_vs_random_pval"
          "Alt_end_FDR": Benjamini-Hochberg correction of "Alt_end_vs_random_pval"
-
          "Test_avg_shift": cosine shift in response to given perturbation in cells from test distribution
          "Null_avg_shift": cosine shift in response to given perturbation in cells from null distribution (e.g. random cells)
          "Test_vs_null_avg_shift": difference in cosine shift in cells from test vs. null distribution
@@ -648,7 +889,7 @@ class InSilicoPerturberStats:
          "Test_vs_null_FDR": Benjamini-Hochberg correction of "Test_vs_null_pval"
          "N_Detections_test": "N_Detections" in cells from test distribution
          "N_Detections_null": "N_Detections" in cells from null distribution
-
          "Anchor_shift": cosine shift in response to given perturbation of anchor gene
          "Test_token_shift": cosine shift in response to given perturbation of test gene
          "Sum_of_indiv_shifts": sum of cosine shifts in response to individually perturbing test and anchor genes
@@ -658,13 +899,27 @@ class InSilicoPerturberStats:
          "Impact_component": whether the given perturbation was modeled to be within the impact component by the mixture model
              1: within impact component; 0: not within impact component
          "Impact_component_percent": percent of cells in which given perturbation was modeled to be within impact component
          """
 
-         if self.mode not in ["goal_state_shift", "vs_null", "mixture_model","aggregate_data"]:
              logger.error(
-                 "Currently, only modes available are stats for goal_state_shift, " \
-                 "vs_null (comparing to null distribution), and " \
-                 "mixture_model (fitting mixture model for perturbations with or without impact.")
              raise
 
          self.gene_token_id_dict = invert_dict(self.gene_token_dict)
@@ -673,44 +928,107 @@ class InSilicoPerturberStats:
          # obtain total gene list
          if (self.combos == 0) and (self.anchor_token is not None):
              # cos sim data for effect of gene perturbation on the embedding of each other gene
-             dict_list = read_dictionaries(input_data_directory, "gene", self.anchor_token)
              gene_list = get_gene_list(dict_list, "gene")
          else:
              # cos sim data for effect of gene perturbation on the embedding of each cell
-             dict_list = read_dictionaries(input_data_directory, "cell", self.anchor_token)
              gene_list = get_gene_list(dict_list, "cell")
-
          # initiate results dataframe
-         cos_sims_df_initial = pd.DataFrame({"Gene": gene_list,
-                                             "Gene_name": [self.token_to_gene_name(item) \
-                                                           for item in gene_list], \
-                                             "Ensembl_ID": [token_tuple_to_ensembl_ids(genes, self.gene_token_id_dict) \
-                                                            if self.genes_perturbed != "all" else \
-                                                            self.gene_token_id_dict[genes[1]] \
-                                                            if isinstance(genes,tuple) else \
-                                                            self.gene_token_id_dict[genes] \
-                                                            for genes in gene_list]}, \
-                                            index=[i for i in range(len(gene_list))])
 
          if self.mode == "goal_state_shift":
-             cos_sims_df = isp_stats_to_goal_state(cos_sims_df_initial, dict_list, self.cell_states_to_model, self.genes_perturbed)
-
         elif self.mode == "vs_null":
-             null_dict_list = read_dictionaries(null_dist_data_directory, "cell", self.anchor_token)
-             cos_sims_df = isp_stats_vs_null(cos_sims_df_initial, dict_list, null_dict_list)
 
         elif self.mode == "mixture_model":
-             cos_sims_df = isp_stats_mixture_model(cos_sims_df_initial, dict_list, self.combos, self.anchor_token)
-
         elif self.mode == "aggregate_data":
             cos_sims_df = isp_aggregate_grouped_perturb(cos_sims_df_initial, dict_list)
 
         # save perturbation stats to output_path
         output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")
         cos_sims_df.to_csv(output_path)
 
     def token_to_gene_name(self, item):
-         if isinstance(item,int):
-             return self.gene_id_name_dict.get(self.gene_token_id_dict.get(item, np.nan), np.nan)
-         if isinstance(item,tuple):
-             return tuple([self.gene_id_name_dict.get(self.gene_token_id_dict.get(i, np.nan), np.nan) for i in item])
@@ -6,9 +6,9 @@ Usage:
  ispstats = InSilicoPerturberStats(mode="goal_state_shift",
                                    combos=0,
                                    anchor_gene=None,
+                                   cell_states_to_model={"state_key": "disease",
+                                                         "start_state": "dcm",
+                                                         "goal_state": "nf",
                                                          "alt_states": ["hcm", "other1", "other2"]})
  ispstats.get_stats("path/to/input_data",
                     None,
@@ -17,88 +17,157 @@ Usage:
  """
 
 
  import logging
+ import os
  import pickle
  import random
  from pathlib import Path
+
+ import numpy as np
+ import pandas as pd
+ import statsmodels.stats.multitest as smt
  from scipy.stats import ranksums
  from sklearn.mixture import GaussianMixture
+ from tqdm.auto import tqdm, trange
 
+ from .perturber_utils import flatten_list, validate_cell_states_to_model
  from .tokenizer import TOKEN_DICTIONARY_FILE
 
  GENE_NAME_ID_DICTIONARY_FILE = Path(__file__).parent / "gene_name_id_dict.pkl"
 
  logger = logging.getLogger(__name__)
 
+
  # invert dictionary keys/values
  def invert_dict(dictionary):
      return {v: k for k, v in dictionary.items()}
 
+
+ def read_dict(cos_sims_dict, cell_or_gene_emb, anchor_token):
+     if cell_or_gene_emb == "cell":
+         cell_emb_dict = {
+             k: v for k, v in cos_sims_dict.items() if v and "cell_emb" in k
+         }
+         return [cell_emb_dict]
+     elif cell_or_gene_emb == "gene":
+         if anchor_token is None:
+             gene_emb_dict = {k: v for k, v in cos_sims_dict.items() if v}
+         else:
+             gene_emb_dict = {
+                 k: v for k, v in cos_sims_dict.items() if v and anchor_token == k[0]
+             }
+         return [gene_emb_dict]
+
+
  # read raw dictionary files
+ def read_dictionaries(
+     input_data_directory,
+     cell_or_gene_emb,
+     anchor_token,
+     cell_states_to_model,
+     pickle_suffix,
+ ):
+     file_found = False
      file_path_list = []
+     if cell_states_to_model is None:
+         dict_list = []
+     else:
+         validate_cell_states_to_model(cell_states_to_model)
+         cell_states_to_model_valid = {
+             state: value
+             for state, value in cell_states_to_model.items()
+             if state != "state_key"
+             and cell_states_to_model[state] is not None
+             and cell_states_to_model[state] != []
+         }
+         cell_states_list = []
+         # flatten all state values into list
+         for state in cell_states_to_model_valid:
+             value = cell_states_to_model_valid[state]
+             if isinstance(value, list):
+                 cell_states_list += value
+             else:
+                 cell_states_list.append(value)
+         state_dict = {state_value: dict() for state_value in cell_states_list}
      for file in os.listdir(input_data_directory):
+         # process only files with given suffix (e.g. "_raw.pickle")
+         if file.endswith(pickle_suffix):
+             file_found = True
              file_path_list += [f"{input_data_directory}/{file}"]
      for file_path in tqdm(file_path_list):
          with open(file_path, "rb") as fp:
              cos_sims_dict = pickle.load(fp)
+             if cell_states_to_model is None:
+                 dict_list += read_dict(cos_sims_dict, cell_or_gene_emb, anchor_token)
+             else:
+                 for state_value in cell_states_list:
+                     new_dict = read_dict(
+                         cos_sims_dict[state_value], cell_or_gene_emb, anchor_token
+                     )[0]
+                     for key in new_dict:
+                         try:
+                             state_dict[state_value][key] += new_dict[key]
+                         except KeyError:
+                             state_dict[state_value][key] = new_dict[key]
+     if not file_found:
          logger.error(
+             "No raw data for processing found within provided directory. "
115
+ "Please ensure data files end with '{pickle_suffix}'."
116
+ )
117
  raise
118
+ if cell_states_to_model is None:
119
+ return dict_list
120
+ else:
121
+ return state_dict
122
+
123
 
124
  # get complete gene list
125
+ def get_gene_list(dict_list, mode):
126
  if mode == "cell":
127
  position = 0
128
  elif mode == "gene":
129
  position = 1
130
  gene_set = set()
131
+ if isinstance(dict_list, list):
132
+ for dict_i in dict_list:
133
+ gene_set.update([k[position] for k, v in dict_i.items() if v])
134
+ elif isinstance(dict_list, dict):
135
+ for state, dict_i in dict_list.items():
136
+ gene_set.update([k[position] for k, v in dict_i.items() if v])
137
+ else:
138
+ logger.error(
139
+ "dict_list should be a list, or if modeling shift to goal states, a dict. "
140
+ f"{type(dict_list)} is not the correct format."
141
+ )
142
+ raise
143
  gene_list = list(gene_set)
144
  if mode == "gene":
145
  gene_list.remove("cell_emb")
146
  gene_list.sort()
147
  return gene_list
148
 
149
+
150
  def token_tuple_to_ensembl_ids(token_tuple, gene_token_id_dict):
151
+ try:
152
+ return tuple([gene_token_id_dict.get(i, np.nan) for i in token_tuple])
153
+ except TypeError:
154
+ return tuple(gene_token_id_dict.get(token_tuple, np.nan))
155
+
156
 
157
  def n_detections(token, dict_list, mode, anchor_token):
158
  cos_sim_megalist = []
159
  for dict_i in dict_list:
160
  if mode == "cell":
161
+ cos_sim_megalist += dict_i.get((token, "cell_emb"), [])
162
  elif mode == "gene":
163
+ cos_sim_megalist += dict_i.get((anchor_token, token), [])
164
  return len(cos_sim_megalist)
165
 
166
+
167
  def get_fdr(pvalues):
168
  return list(smt.multipletests(pvalues, alpha=0.05, method="fdr_bh")[1])
169
 
170
+
171
  def get_impact_component(test_value, gaussian_mixture_model):
172
  impact_border = gaussian_mixture_model.means_[0][0]
173
  nonimpact_border = gaussian_mixture_model.means_[1][0]
183
  impact_component = 1
184
  return impact_component
185
 
186
+
187
  # aggregate data for single perturbation in multiple cells
188
+ def isp_aggregate_grouped_perturb(cos_sims_df, dict_list):
189
+ names = ["Cosine_shift"]
190
  cos_sims_full_df = pd.DataFrame(columns=names)
191
 
192
  cos_shift_data = []
193
  token = cos_sims_df["Gene"][0]
194
  for dict_i in dict_list:
195
+ cos_shift_data += dict_i.get((token, "cell_emb"), [])
196
  cos_sims_full_df["Cosine_shift"] = cos_shift_data
197
+ return cos_sims_full_df
198
+
199
+
200
+ def find(variable, x):
201
+ try:
202
+ if x in variable: # Test if variable is iterable and contains x
203
+ return True
204
+ except TypeError:
205
+ return x == variable # Test if variable is x if non-iterable
206
+
207
+
208
+ def isp_aggregate_gene_shifts(
209
+ cos_sims_df, dict_list, gene_token_id_dict, gene_id_name_dict
210
+ ):
211
+ cos_shift_data = dict()
212
+ for i in trange(cos_sims_df.shape[0]):
213
+ token = cos_sims_df["Gene"][i]
214
+ for dict_i in dict_list:
215
+ affected_pairs = [k for k, v in dict_i.items() if find(k[0], token)]
216
+ for key in affected_pairs:
217
+ if key in cos_shift_data.keys():
218
+ cos_shift_data[key] += dict_i.get(key, [])
219
+ else:
220
+ cos_shift_data[key] = dict_i.get(key, [])
221
+
222
+ cos_data_mean = {
223
+ k: [np.mean(v), np.std(v), len(v)] for k, v in cos_shift_data.items()
224
+ }
225
+ cos_sims_full_df = pd.DataFrame()
226
+ cos_sims_full_df["Perturbed"] = [k[0] for k, v in cos_data_mean.items()]
227
+ cos_sims_full_df["Gene_name"] = [
228
+ cos_sims_df[cos_sims_df["Gene"] == k[0]]["Gene_name"][0]
229
+ for k, v in cos_data_mean.items()
230
+ ]
231
+ cos_sims_full_df["Ensembl_ID"] = [
232
+ cos_sims_df[cos_sims_df["Gene"] == k[0]]["Ensembl_ID"][0]
233
+ for k, v in cos_data_mean.items()
234
+ ]
235
+ cos_sims_full_df["Affected"] = [k[1] for k, v in cos_data_mean.items()]
236
+ cos_sims_full_df["Affected_Gene_name"] = [
237
+ gene_id_name_dict.get(gene_token_id_dict.get(token, np.nan), np.nan)
238
+ for token in cos_sims_full_df["Affected"]
239
+ ]
240
+ cos_sims_full_df["Affected_Ensembl_ID"] = [
241
+ gene_token_id_dict.get(token, np.nan) for token in cos_sims_full_df["Affected"]
242
+ ]
243
+ cos_sims_full_df["Cosine_shift_mean"] = [v[0] for k, v in cos_data_mean.items()]
244
+ cos_sims_full_df["Cosine_shift_stdev"] = [v[1] for k, v in cos_data_mean.items()]
245
+ cos_sims_full_df["N_Detections"] = [v[2] for k, v in cos_data_mean.items()]
246
+
247
+ specific_val = "cell_emb"
248
+ cos_sims_full_df["temp"] = list(cos_sims_full_df["Affected"] == specific_val)
249
+ # reorder so cell embs are at the top and all are subordered by magnitude of cosine shift
250
+ cos_sims_full_df = cos_sims_full_df.sort_values(
251
+ by=(["temp", "Cosine_shift_mean"]), ascending=[False, False]
252
+ ).drop("temp", axis=1)
253
+
254
+ return cos_sims_full_df
255
+
256
 
257
  # stats comparing cos sim shifts towards goal state of test perturbations vs random perturbations
258
+ def isp_stats_to_goal_state(
259
+ cos_sims_df, result_dict, cell_states_to_model, genes_perturbed
260
+ ):
261
+ if (
262
+ ("alt_states" not in cell_states_to_model.keys())
263
+ or (len(cell_states_to_model["alt_states"]) == 0)
264
+ or (cell_states_to_model["alt_states"] == [None])
265
+ ):
266
  alt_end_state_exists = False
267
+ elif (len(cell_states_to_model["alt_states"]) > 0) and (
268
+ cell_states_to_model["alt_states"] != [None]
269
+ ):
270
  alt_end_state_exists = True
271
+
272
  # for single perturbation in multiple cells, there are no random perturbations to compare to
273
  if genes_perturbed != "all":
274
+ cos_sims_full_df = pd.DataFrame()
275
+
276
+ cos_shift_data_end = []
 
 
 
 
277
  token = cos_sims_df["Gene"][0]
278
+ cos_shift_data_end += result_dict[cell_states_to_model["goal_state"]].get(
279
+ (token, "cell_emb"), []
280
+ )
281
+ cos_sims_full_df["Shift_to_goal_end"] = [np.mean(cos_shift_data_end)]
282
+ if alt_end_state_exists is True:
283
+ for alt_state in cell_states_to_model["alt_states"]:
284
+ cos_shift_data_alt_state = []
285
+ cos_shift_data_alt_state += result_dict.get(alt_state).get(
286
+ (token, "cell_emb"), []
287
+ )
288
+ cos_sims_full_df[f"Shift_to_alt_end_{alt_state}"] = [
289
+ np.mean(cos_shift_data_alt_state)
290
+ ]
291
+
292
  # sort by shift to desired state
293
+ cos_sims_full_df = cos_sims_full_df.sort_values(
294
+ by=["Shift_to_goal_end"], ascending=[False]
295
+ )
296
+ return cos_sims_full_df
297
+
298
  elif genes_perturbed == "all":
299
+ goal_end_random_megalist = []
300
+ if alt_end_state_exists is True:
301
+ alt_end_state_random_dict = {
302
+ alt_state: [] for alt_state in cell_states_to_model["alt_states"]
303
+ }
304
  for i in trange(cos_sims_df.shape[0]):
305
  token = cos_sims_df["Gene"][i]
306
+ goal_end_random_megalist += result_dict[
307
+ cell_states_to_model["goal_state"]
308
+ ].get((token, "cell_emb"), [])
309
+ if alt_end_state_exists is True:
310
+ for alt_state in cell_states_to_model["alt_states"]:
311
+ alt_end_state_random_dict[alt_state] += result_dict[alt_state].get(
312
+ (token, "cell_emb"), []
313
+ )
314
 
315
  # downsample to improve speed of ranksums
316
  if len(goal_end_random_megalist) > 100_000:
317
  random.seed(42)
318
+ goal_end_random_megalist = random.sample(
319
+ goal_end_random_megalist, k=100_000
320
+ )
321
+ if alt_end_state_exists is True:
322
+ for alt_state in cell_states_to_model["alt_states"]:
323
+ if len(alt_end_state_random_dict[alt_state]) > 100_000:
324
+ random.seed(42)
325
+ alt_end_state_random_dict[alt_state] = random.sample(
326
+ alt_end_state_random_dict[alt_state], k=100_000
327
+ )
328
+
329
+ names = [
330
+ "Gene",
331
+ "Gene_name",
332
+ "Ensembl_ID",
333
+ "Shift_to_goal_end",
334
+ "Goal_end_vs_random_pval",
335
+ ]
336
+ if alt_end_state_exists is True:
337
+ [
338
+ names.append(f"Shift_to_alt_end_{alt_state}")
339
+ for alt_state in cell_states_to_model["alt_states"]
340
+ ]
341
+ names.append(names.pop(names.index("Goal_end_vs_random_pval")))
342
+ [
343
+ names.append(f"Alt_end_vs_random_pval_{alt_state}")
344
+ for alt_state in cell_states_to_model["alt_states"]
345
+ ]
346
  cos_sims_full_df = pd.DataFrame(columns=names)
347
 
348
+ n_detections_dict = dict()
349
  for i in trange(cos_sims_df.shape[0]):
350
  token = cos_sims_df["Gene"][i]
351
  name = cos_sims_df["Gene_name"][i]
352
  ensembl_id = cos_sims_df["Ensembl_ID"][i]
353
+ goal_end_cos_sim_megalist = result_dict[
354
+ cell_states_to_model["goal_state"]
355
+ ].get((token, "cell_emb"), [])
356
+ n_detections_dict[token] = len(goal_end_cos_sim_megalist)
357
+ mean_goal_end = np.mean(goal_end_cos_sim_megalist)
358
+ pval_goal_end = ranksums(
359
+ goal_end_random_megalist, goal_end_cos_sim_megalist
360
+ ).pvalue
361
 
362
+ if alt_end_state_exists is True:
363
+ alt_end_state_dict = {
364
+ alt_state: [] for alt_state in cell_states_to_model["alt_states"]
365
+ }
366
+ for alt_state in cell_states_to_model["alt_states"]:
367
+ alt_end_state_dict[alt_state] = result_dict[alt_state].get(
368
+ (token, "cell_emb"), []
369
+ )
370
+ alt_end_state_dict[f"{alt_state}_mean"] = np.mean(
371
+ alt_end_state_dict[alt_state]
372
+ )
373
+ alt_end_state_dict[f"{alt_state}_pval"] = ranksums(
374
+ alt_end_state_random_dict[alt_state],
375
+ alt_end_state_dict[alt_state],
376
+ ).pvalue
377
 
378
+ results_dict = dict()
379
+ results_dict["Gene"] = token
380
+ results_dict["Gene_name"] = name
381
+ results_dict["Ensembl_ID"] = ensembl_id
382
+ results_dict["Shift_to_goal_end"] = mean_goal_end
383
+ results_dict["Goal_end_vs_random_pval"] = pval_goal_end
384
+ if alt_end_state_exists is True:
385
+ for alt_state in cell_states_to_model["alt_states"]:
386
+ results_dict[f"Shift_to_alt_end_{alt_state}"] = alt_end_state_dict[
387
+ f"{alt_state}_mean"
388
+ ]
389
+ results_dict[
390
+ f"Alt_end_vs_random_pval_{alt_state}"
391
+ ] = alt_end_state_dict[f"{alt_state}_pval"]
392
 
393
+ cos_sims_df_i = pd.DataFrame(results_dict, index=[i])
394
+ cos_sims_full_df = pd.concat([cos_sims_full_df, cos_sims_df_i])
395
+
396
+ cos_sims_full_df["Goal_end_FDR"] = get_fdr(
397
+ list(cos_sims_full_df["Goal_end_vs_random_pval"])
398
+ )
399
+ if alt_end_state_exists is True:
400
+ for alt_state in cell_states_to_model["alt_states"]:
401
+ cos_sims_full_df[f"Alt_end_FDR_{alt_state}"] = get_fdr(
402
+ list(cos_sims_full_df[f"Alt_end_vs_random_pval_{alt_state}"])
403
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
404
 
405
  # quantify number of detections of each gene
406
+ cos_sims_full_df["N_Detections"] = [
407
+ n_detections_dict[token] for token in cos_sims_full_df["Gene"]
408
+ ]
409
+
410
+ # sort by shift to desired state
411
+ cos_sims_full_df["Sig"] = [
412
+ 1 if fdr < 0.05 else 0 for fdr in cos_sims_full_df["Goal_end_FDR"]
413
+ ]
414
+ cos_sims_full_df = cos_sims_full_df.sort_values(
415
+ by=["Sig", "Shift_to_goal_end", "Goal_end_FDR"],
416
+ ascending=[False, False, True],
417
+ )
418
+
419
  return cos_sims_full_df
420
 
421
+
422
  # stats comparing cos sim shifts of test perturbations vs null distribution
423
  def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
424
  cos_sims_full_df = cos_sims_df.copy()
425
 
426
  cos_sims_full_df["Test_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
427
  cos_sims_full_df["Null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
428
+ cos_sims_full_df["Test_vs_null_avg_shift"] = np.zeros(
429
+ cos_sims_df.shape[0], dtype=float
430
+ )
431
  cos_sims_full_df["Test_vs_null_pval"] = np.zeros(cos_sims_df.shape[0], dtype=float)
432
  cos_sims_full_df["Test_vs_null_FDR"] = np.zeros(cos_sims_df.shape[0], dtype=float)
433
+ cos_sims_full_df["N_Detections_test"] = np.zeros(
434
+ cos_sims_df.shape[0], dtype="uint32"
435
+ )
436
+ cos_sims_full_df["N_Detections_null"] = np.zeros(
437
+ cos_sims_df.shape[0], dtype="uint32"
438
+ )
439
+
440
  for i in trange(cos_sims_df.shape[0]):
441
  token = cos_sims_df["Gene"][i]
442
  test_shifts = []
443
  null_shifts = []
444
+
445
  for dict_i in dict_list:
446
+ test_shifts += dict_i.get((token, "cell_emb"), [])
447
 
448
  for dict_i in null_dict_list:
449
+ null_shifts += dict_i.get((token, "cell_emb"), [])
450
+
451
  cos_sims_full_df.loc[i, "Test_avg_shift"] = np.mean(test_shifts)
452
  cos_sims_full_df.loc[i, "Null_avg_shift"] = np.mean(null_shifts)
453
+ cos_sims_full_df.loc[i, "Test_vs_null_avg_shift"] = np.mean(
454
+ test_shifts
455
+ ) - np.mean(null_shifts)
456
+ cos_sims_full_df.loc[i, "Test_vs_null_pval"] = ranksums(
457
+ test_shifts, null_shifts, nan_policy="omit"
458
+ ).pvalue
459
+ # remove nan values
460
+ cos_sims_full_df.Test_vs_null_pval = np.where(
461
+ np.isnan(cos_sims_full_df.Test_vs_null_pval),
462
+ 1,
463
+ cos_sims_full_df.Test_vs_null_pval,
464
+ )
465
  cos_sims_full_df.loc[i, "N_Detections_test"] = len(test_shifts)
466
  cos_sims_full_df.loc[i, "N_Detections_null"] = len(null_shifts)
467
 
468
+ cos_sims_full_df["Test_vs_null_FDR"] = get_fdr(
469
+ cos_sims_full_df["Test_vs_null_pval"]
470
+ )
471
+
472
+ cos_sims_full_df["Sig"] = [
473
+ 1 if fdr < 0.05 else 0 for fdr in cos_sims_full_df["Test_vs_null_FDR"]
474
+ ]
475
+ cos_sims_full_df = cos_sims_full_df.sort_values(
476
+ by=["Sig", "Test_vs_null_avg_shift", "Test_vs_null_FDR"],
477
+ ascending=[False, False, True],
478
+ )
479
  return cos_sims_full_df
480
 
481
+
482
  # stats for identifying perturbations with largest effect within a given set of cells
483
  # fits a mixture model to 2 components (impact vs. non-impact) and
484
  # reports the most likely component for each test perturbation
485
  # Note: because assumes given perturbation has a consistent effect in the cells tested,
486
  # we recommend only using the mixture model strategy with uniform cell populations
487
  def isp_stats_mixture_model(cos_sims_df, dict_list, combos, anchor_token):
488
+ names = ["Gene", "Gene_name", "Ensembl_ID"]
489
+
 
 
 
490
  if combos == 0:
491
  names += ["Test_avg_shift"]
492
  elif combos == 1:
493
+ names += [
494
+ "Anchor_shift",
495
+ "Test_token_shift",
496
+ "Sum_of_indiv_shifts",
497
+ "Combo_shift",
498
+ "Combo_minus_sum_shift",
499
+ ]
500
+
501
+ names += ["Impact_component", "Impact_component_percent"]
502
 
503
  cos_sims_full_df = pd.DataFrame(columns=names)
504
  avg_values = []
505
  gene_names = []
506
+
507
  for i in trange(cos_sims_df.shape[0]):
508
  token = cos_sims_df["Gene"][i]
509
  name = cos_sims_df["Gene_name"][i]
510
  ensembl_id = cos_sims_df["Ensembl_ID"][i]
511
  cos_shift_data = []
512
+
513
  for dict_i in dict_list:
514
  if (combos == 0) and (anchor_token is not None):
515
+ cos_shift_data += dict_i.get((anchor_token, token), [])
516
  else:
517
+ cos_shift_data += dict_i.get((token, "cell_emb"), [])
518
+
519
  # Extract values for current gene
520
  if combos == 0:
521
  test_values = cos_shift_data
522
  elif combos == 1:
523
  test_values = []
524
  for tup in cos_shift_data:
525
+ test_values.append(tup[2])
526
+
527
  if len(test_values) > 0:
528
  avg_value = np.mean(test_values)
529
  avg_values.append(avg_value)
530
  gene_names.append(name)
531
+
532
  # fit Gaussian mixture model to dataset of mean for each gene
533
  avg_values_to_fit = np.array(avg_values).reshape(-1, 1)
534
  gm = GaussianMixture(n_components=2, random_state=0).fit(avg_values_to_fit)
535
+
536
  for i in trange(cos_sims_df.shape[0]):
537
  token = cos_sims_df["Gene"][i]
538
  name = cos_sims_df["Gene_name"][i]
541
 
542
  for dict_i in dict_list:
543
  if (combos == 0) and (anchor_token is not None):
544
+ cos_shift_data += dict_i.get((anchor_token, token), [])
545
  else:
546
+ cos_shift_data += dict_i.get((token, "cell_emb"), [])
547
+
548
  if combos == 0:
549
  mean_test = np.mean(cos_shift_data)
550
+ impact_components = [
551
+ get_impact_component(value, gm) for value in cos_shift_data
552
+ ]
553
  elif combos == 1:
554
+ anchor_cos_sim_megalist = [
555
+ anchor for anchor, token, combo in cos_shift_data
556
+ ]
557
+ token_cos_sim_megalist = [token for anchor, token, combo in cos_shift_data]
558
+ anchor_plus_token_cos_sim_megalist = [
559
+ 1 - ((1 - anchor) + (1 - token))
560
+ for anchor, token, combo in cos_shift_data
561
+ ]
562
+ combo_anchor_token_cos_sim_megalist = [
563
+ combo for anchor, token, combo in cos_shift_data
564
+ ]
565
+ combo_minus_sum_cos_sim_megalist = [
566
+ combo - (1 - ((1 - anchor) + (1 - token)))
567
+ for anchor, token, combo in cos_shift_data
568
+ ]
569
 
570
  mean_anchor = np.mean(anchor_cos_sim_megalist)
571
  mean_token = np.mean(token_cos_sim_megalist)
572
  mean_sum = np.mean(anchor_plus_token_cos_sim_megalist)
573
  mean_test = np.mean(combo_anchor_token_cos_sim_megalist)
574
  mean_combo_minus_sum = np.mean(combo_minus_sum_cos_sim_megalist)
575
+
576
+ impact_components = [
577
+ get_impact_component(value, gm)
578
+ for value in combo_anchor_token_cos_sim_megalist
579
+ ]
580
+
581
+ impact_component = get_impact_component(mean_test, gm)
582
+ impact_component_percent = np.mean(impact_components) * 100
583
+
584
+ data_i = [token, name, ensembl_id]
585
  if combos == 0:
586
  data_i += [mean_test]
587
  elif combos == 1:
588
+ data_i += [
589
+ mean_anchor,
590
+ mean_token,
591
+ mean_sum,
592
+ mean_test,
593
+ mean_combo_minus_sum,
594
+ ]
595
+ data_i += [impact_component, impact_component_percent]
596
+
597
+ cos_sims_df_i = pd.DataFrame(dict(zip(names, data_i)), index=[i])
598
+ cos_sims_full_df = pd.concat([cos_sims_full_df, cos_sims_df_i])
599
+
600
  # quantify number of detections of each gene
601
+ cos_sims_full_df["N_Detections"] = [
602
+ n_detections(i, dict_list, "gene", anchor_token)
603
+ for i in cos_sims_full_df["Gene"]
604
+ ]
605
+
606
  if combos == 0:
607
+ cos_sims_full_df = cos_sims_full_df.sort_values(
608
+ by=["Impact_component", "Test_avg_shift"], ascending=[False, True]
609
+ )
610
  elif combos == 1:
611
+ cos_sims_full_df = cos_sims_full_df.sort_values(
612
+ by=["Impact_component", "Combo_minus_sum_shift"], ascending=[False, True]
613
+ )
614
  return cos_sims_full_df
615
 
616
+
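get_impact_component above maps each cosine-shift value to one of the two fitted mixture components. A hypothetical sketch of such a helper (the real one is defined elsewhere in this module), assuming the "impact" component is the one whose fitted mean has the larger magnitude:

    import numpy as np

    def get_impact_component_sketch(value, gm):
        # predict the component for a single cosine-shift value and return 1
        # if it is the larger-magnitude-mean ("impact") component, else 0
        component = gm.predict(np.array([[value]]))[0]
        impact_component = int(np.argmax(np.abs(gm.means_.ravel())))
        return 1 if component == impact_component else 0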
617
  class InSilicoPerturberStats:
618
  valid_option_dict = {
619
+ "mode": {
620
+ "goal_state_shift",
621
+ "vs_null",
622
+ "mixture_model",
623
+ "aggregate_data",
624
+ "aggregate_gene_shifts",
625
+ },
626
+ "genes_perturbed": {"all", list},
627
+ "combos": {0, 1},
628
  "anchor_gene": {None, str},
629
  "cell_states_to_model": {None, dict},
630
+ "pickle_suffix": {None, str},
631
  }
632
+
633
  def __init__(
634
  self,
635
  mode="mixture_model",
637
  combos=0,
638
  anchor_gene=None,
639
  cell_states_to_model=None,
640
+ pickle_suffix="_raw.pickle",
641
  token_dictionary_file=TOKEN_DICTIONARY_FILE,
642
  gene_name_id_dictionary_file=GENE_NAME_ID_DICTIONARY_FILE,
643
  ):
646
 
647
  Parameters
648
  ----------
649
+ mode : {"goal_state_shift","vs_null","mixture_model","aggregate_data","aggregate_gene_shifts"}
650
  Type of stats.
651
  "goal_state_shift": perturbation vs. random for desired cell state shift
652
  "vs_null": perturbation vs. null from provided null distribution dataset
653
  "mixture_model": perturbation in impact vs. no impact component of mixture model (no goal direction)
654
  "aggregate_data": aggregates cosine shifts for single perturbation in multiple cells
655
+ "aggregate_gene_shifts": aggregates cosine shifts of genes in response to perturbation(s)
656
  genes_perturbed : "all", list
657
  Genes perturbed in isp experiment.
658
  Default is assuming genes_to_perturb in isp experiment was "all" (each gene in each cell).
687
  self.combos = combos
688
  self.anchor_gene = anchor_gene
689
  self.cell_states_to_model = cell_states_to_model
690
+ self.pickle_suffix = pickle_suffix
691
+
692
  self.validate_options()
693
 
694
  # load token dictionary (Ensembl IDs:token)
695
  with open(token_dictionary_file, "rb") as f:
696
  self.gene_token_dict = pickle.load(f)
697
+
698
  # load gene name dictionary (gene name:Ensembl ID)
699
  with open(gene_name_id_dictionary_file, "rb") as f:
700
  self.gene_name_id_dict = pickle.load(f)
705
  self.anchor_token = self.gene_token_dict[self.anchor_gene]
706
 
707
  def validate_options(self):
708
+ for attr_name, valid_options in self.valid_option_dict.items():
709
  attr_value = self.__dict__[attr_name]
710
  if type(attr_value) not in {list, dict}:
711
  if attr_name in {"anchor_gene"}:
714
  continue
715
  valid_type = False
716
  for option in valid_options:
717
+ if (option in [str, int, list, dict]) and isinstance(
718
+ attr_value, option
719
+ ):
720
  valid_type = True
721
  break
722
+ if not valid_type:
723
+ logger.error(
724
+ f"Invalid option for {attr_name}. "
725
+ f"Valid options for {attr_name}: {valid_options}"
726
+ )
727
+ raise
728
+
 
729
  if self.cell_states_to_model is not None:
730
  if len(self.cell_states_to_model.items()) == 1:
731
  logger.warning(
732
+ "The single value dictionary for cell_states_to_model will be "
733
+ "replaced with a dictionary with named keys for start, goal, and alternate states. "
734
+ "Please specify state_key, start_state, goal_state, and alt_states "
735
+ "in the cell_states_to_model dictionary for future use. "
736
+ "For example, cell_states_to_model={"
737
+ "'state_key': 'disease', "
738
+ "'start_state': 'dcm', "
739
+ "'goal_state': 'nf', "
740
+ "'alt_states': ['hcm', 'other1', 'other2']}"
741
  )
742
+ for key, value in self.cell_states_to_model.items():
743
  if (len(value) == 3) and isinstance(value, tuple):
744
+ if (
745
+ isinstance(value[0], list)
746
+ and isinstance(value[1], list)
747
+ and isinstance(value[2], list)
748
+ ):
749
  if len(value[0]) == 1 and len(value[1]) == 1:
750
+ all_values = value[0] + value[1] + value[2]
751
  if len(all_values) == len(set(all_values)):
752
  continue
753
  # reformat to the new named key format
756
  "state_key": list(self.cell_states_to_model.keys())[0],
757
  "start_state": state_values[0][0],
758
  "goal_state": state_values[1][0],
759
+ "alt_states": state_values[2:][0],
760
  }
761
+ elif set(self.cell_states_to_model.keys()) == {
762
+ "state_key",
763
+ "start_state",
764
+ "goal_state",
765
+ "alt_states",
766
+ }:
767
+ if (
768
+ (self.cell_states_to_model["state_key"] is None)
769
+ or (self.cell_states_to_model["start_state"] is None)
770
+ or (self.cell_states_to_model["goal_state"] is None)
771
+ ):
772
  logger.error(
773
+ "Please specify 'state_key', 'start_state', and 'goal_state' in cell_states_to_model."
774
+ )
775
  raise
776
+
777
+ if (
778
+ self.cell_states_to_model["start_state"]
779
+ == self.cell_states_to_model["goal_state"]
780
+ ):
781
+ logger.error("All states must be unique.")
782
  raise
783
 
784
  if self.cell_states_to_model["alt_states"] is not None:
785
+ if not isinstance(self.cell_states_to_model["alt_states"], list):
786
  logger.error(
787
  "self.cell_states_to_model['alt_states'] must be a list (even if it is one element)."
788
  )
789
  raise
790
+ if len(self.cell_states_to_model["alt_states"]) != len(
791
+ set(self.cell_states_to_model["alt_states"])
792
+ ):
793
+ logger.error("All states must be unique.")
794
  raise
795
 
796
  else:
797
  logger.error(
798
+ "cell_states_to_model must only have the following four keys: "
799
+ "'state_key', 'start_state', 'goal_state', 'alt_states'."
800
+ "For example, cell_states_to_model={"
801
+ "'state_key': 'disease', "
802
+ "'start_state': 'dcm', "
803
+ "'goal_state': 'nf', "
804
+ "'alt_states': ['hcm', 'other1', 'other2']}"
805
  )
806
  raise
807
 
808
  if self.anchor_gene is not None:
809
  self.anchor_gene = None
810
  logger.warning(
811
+ "anchor_gene set to None. "
812
+ "Currently, anchor gene not available "
813
+ "when modeling multiple cell states."
814
+ )
815
+
816
  if self.combos > 0:
817
  if self.anchor_gene is None:
818
  logger.error(
819
+ "Currently, stats are only supported for combination "
820
+ "in silico perturbation run with anchor gene. Please add "
821
+ "anchor gene when using with combos > 0. "
822
+ )
823
  raise
824
+
825
  if (self.mode == "mixture_model") and (self.genes_perturbed != "all"):
826
  logger.error(
827
+ "Mixture model mode requires multiple gene perturbations to fit model "
828
+ "so is incompatible with a single grouped perturbation."
829
+ )
830
  raise
831
  if (self.mode == "aggregate_data") and (self.genes_perturbed == "all"):
832
  logger.error(
833
+ "Simple data aggregation mode is for single perturbation in multiple cells "
834
+ "so is incompatible with a genes_perturbed being 'all'."
835
+ )
836
+ raise
837
+
838
+ def get_stats(
839
+ self,
840
+ input_data_directory,
841
+ null_dist_data_directory,
842
+ output_directory,
843
+ output_prefix,
844
+ null_dict_list=None,
845
+ ):
846
  """
847
  Get stats for in silico perturbation data and save as results in output_directory.
848
 
856
  Path to directory where perturbation data will be saved as .csv
857
  output_prefix : str
858
  Prefix for output .csv
859
+ null_dict_list : list of dict
860
+ List of loaded null distribution dictionaries if more than one comparison vs. the null is to be performed
861
+
862
  Outputs
863
  ----------
864
  Definition of possible columns in .csv output file.
865
+
866
  Of note, not all columns will be present in all output files.
867
  Some columns are specific to particular perturbation modes.
868
+
869
  "Gene": gene token
870
  "Gene_name": gene name
871
  "Ensembl_ID": gene Ensembl ID
872
  "N_Detections": number of cells in which each gene or gene combination was detected in the input dataset
873
  "Sig": 1 if FDR<0.05, otherwise 0
874
+
875
  "Shift_to_goal_end": cosine shift from start state towards goal end state in response to given perturbation
876
  "Shift_to_alt_end": cosine shift from start state towards alternate end state in response to given perturbation
877
  "Goal_end_vs_random_pval": pvalue of cosine shift from start state towards goal end state by Wilcoxon
880
  pvalue compares shift caused by perturbing given gene compared to random genes
881
  "Goal_end_FDR": Benjamini-Hochberg correction of "Goal_end_vs_random_pval"
882
  "Alt_end_FDR": Benjamini-Hochberg correction of "Alt_end_vs_random_pval"
883
+
884
  "Test_avg_shift": cosine shift in response to given perturbation in cells from test distribution
885
  "Null_avg_shift": cosine shift in response to given perturbation in cells from null distribution (e.g. random cells)
886
  "Test_vs_null_avg_shift": difference in cosine shift in cells from test vs. null distribution
889
  "Test_vs_null_FDR": Benjamini-Hochberg correction of "Test_vs_null_pval"
890
  "N_Detections_test": "N_Detections" in cells from test distribution
891
  "N_Detections_null": "N_Detections" in cells from null distribution
892
+
893
  "Anchor_shift": cosine shift in response to given perturbation of anchor gene
894
  "Test_token_shift": cosine shift in response to given perturbation of test gene
895
  "Sum_of_indiv_shifts": sum of cosine shifts in response to individually perturbing test and anchor genes
899
  "Impact_component": whether the given perturbation was modeled to be within the impact component by the mixture model
900
  1: within impact component; 0: not within impact component
901
  "Impact_component_percent": percent of cells in which given perturbation was modeled to be within impact component
902
+
903
+ In case of aggregating gene shifts:
904
+ "Perturbed": ID(s) of gene(s) being perturbed
905
+ "Affected": ID of affected gene or "cell_emb" indicating the impact on the cell embedding as a whole
906
+ "Cosine_shift_mean": mean of cosine shift of modeled perturbation on affected gene or cell
907
+ "Cosine_shift_stdev": standard deviation of cosine shift of modeled perturbation on affected gene or cell
908
  """
909
 
910
+ if self.mode not in [
911
+ "goal_state_shift",
912
+ "vs_null",
913
+ "mixture_model",
914
+ "aggregate_data",
915
+ "aggregate_gene_shifts",
916
+ ]:
917
  logger.error(
918
+ "Currently, only modes available are stats for goal_state_shift, "
919
+ "vs_null (comparing to null distribution), "
920
+ "mixture_model (fitting mixture model for perturbations with or without impact), "
921
+ "and aggregating data for single perturbations or for gene embedding shifts."
922
+ )
923
  raise
924
 
925
  self.gene_token_id_dict = invert_dict(self.gene_token_dict)
928
  # obtain total gene list
929
  if (self.combos == 0) and (self.anchor_token is not None):
930
  # cos sim data for effect of gene perturbation on the embedding of each other gene
931
+ dict_list = read_dictionaries(
932
+ input_data_directory,
933
+ "gene",
934
+ self.anchor_token,
935
+ self.cell_states_to_model,
936
+ self.pickle_suffix,
937
+ )
938
  gene_list = get_gene_list(dict_list, "gene")
939
+ elif (
940
+ (self.combos == 0)
941
+ and (self.anchor_token is None)
942
+ and (self.mode == "aggregate_gene_shifts")
943
+ ):
944
+ dict_list = read_dictionaries(
945
+ input_data_directory,
946
+ "gene",
947
+ self.anchor_token,
948
+ self.cell_states_to_model,
949
+ self.pickle_suffix,
950
+ )
951
+ gene_list = get_gene_list(dict_list, "cell")
952
  else:
953
  # cos sim data for effect of gene perturbation on the embedding of each cell
954
+ dict_list = read_dictionaries(
955
+ input_data_directory,
956
+ "cell",
957
+ self.anchor_token,
958
+ self.cell_states_to_model,
959
+ self.pickle_suffix,
960
+ )
961
  gene_list = get_gene_list(dict_list, "cell")
962
+
963
  # initiate results dataframe
964
+ cos_sims_df_initial = pd.DataFrame(
965
+ {
966
+ "Gene": gene_list,
967
+ "Gene_name": [self.token_to_gene_name(item) for item in gene_list],
968
+ "Ensembl_ID": [
969
+ token_tuple_to_ensembl_ids(genes, self.gene_token_id_dict)
970
+ if self.genes_perturbed != "all"
971
+ else self.gene_token_id_dict[genes[1]]
972
+ if isinstance(genes, tuple)
973
+ else self.gene_token_id_dict[genes]
974
+ for genes in gene_list
975
+ ],
976
+ },
977
+ index=[i for i in range(len(gene_list))],
978
+ )
979
 
980
  if self.mode == "goal_state_shift":
981
+ cos_sims_df = isp_stats_to_goal_state(
982
+ cos_sims_df_initial,
983
+ dict_list,
984
+ self.cell_states_to_model,
985
+ self.genes_perturbed,
986
+ )
987
+
988
  elif self.mode == "vs_null":
989
+ if null_dict_list is None:
990
+ null_dict_list = read_dictionaries(
991
+ null_dist_data_directory,
992
+ "cell",
993
+ self.anchor_token,
994
+ self.cell_states_to_model,
995
+ self.pickle_suffix,
996
+ )
997
+ cos_sims_df = isp_stats_vs_null(
998
+ cos_sims_df_initial, dict_list, null_dict_list
999
+ )
1000
 
1001
  elif self.mode == "mixture_model":
1002
+ cos_sims_df = isp_stats_mixture_model(
1003
+ cos_sims_df_initial, dict_list, self.combos, self.anchor_token
1004
+ )
1005
+
1006
  elif self.mode == "aggregate_data":
1007
  cos_sims_df = isp_aggregate_grouped_perturb(cos_sims_df_initial, dict_list)
1008
 
1009
+ elif self.mode == "aggregate_gene_shifts":
1010
+ cos_sims_df = isp_aggregate_gene_shifts(
1011
+ cos_sims_df_initial,
1012
+ dict_list,
1013
+ self.gene_token_id_dict,
1014
+ self.gene_id_name_dict,
1015
+ )
1016
+
1017
  # save perturbation stats to output_path
1018
  output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")
1019
  cos_sims_df.to_csv(output_path)
1020
 
1021
  def token_to_gene_name(self, item):
1022
+ if isinstance(item, int):
1023
+ return self.gene_id_name_dict.get(
1024
+ self.gene_token_id_dict.get(item, np.nan), np.nan
1025
+ )
1026
+ if isinstance(item, tuple):
1027
+ return tuple(
1028
+ [
1029
+ self.gene_id_name_dict.get(
1030
+ self.gene_token_id_dict.get(i, np.nan), np.nan
1031
+ )
1032
+ for i in item
1033
+ ]
1034
+ )
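A hedged usage sketch of the vs_null mode with the new pickle_suffix and null_dict_list options (all paths are hypothetical placeholders):

    from geneformer import InSilicoPerturberStats

    ispstats = InSilicoPerturberStats(mode="vs_null",
                                      genes_perturbed="all",
                                      combos=0,
                                      anchor_gene=None,
                                      pickle_suffix="_raw.pickle")
    ispstats.get_stats("path/to/isp_output",
                       "path/to/null_isp_output",
                       "path/to/stats_output",
                       "output_prefix")

If null_dict_list is supplied to get_stats, the loaded null distribution dictionaries are used as given instead of being re-read from null_dist_data_directory.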
geneformer/perturber_utils.py ADDED
@@ -0,0 +1,698 @@
1
+ import itertools as it
2
+ import logging
3
+ import pickle
4
+ import re
5
+ from collections import defaultdict
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import seaborn as sns
10
+ import torch
11
+ from datasets import Dataset, load_from_disk
12
+ from transformers import (
13
+ BertForMaskedLM,
14
+ BertForSequenceClassification,
15
+ BertForTokenClassification,
16
+ )
17
+
18
+ sns.set()
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ # load data and filter by defined criteria
24
+ def load_and_filter(filter_data, nproc, input_data_file):
25
+ data = load_from_disk(input_data_file)
26
+ if filter_data is not None:
27
+ data = filter_by_dict(data, filter_data, nproc)
28
+ return data
29
+
30
+
31
+ def filter_by_dict(data, filter_data, nproc):
32
+ for key, value in filter_data.items():
33
+
34
+ def filter_data_by_criteria(example):
35
+ return example[key] in value
36
+
37
+ data = data.filter(filter_data_by_criteria, num_proc=nproc)
38
+ if len(data) == 0:
39
+ logger.error("No cells remain after filtering. Check filtering criteria.")
40
+ raise
41
+ return data
42
+
43
+
44
+ def filter_data_by_tokens(filtered_input_data, tokens, nproc):
45
+ def if_has_tokens(example):
46
+ return len(set(example["input_ids"]).intersection(tokens)) == len(tokens)
47
+
48
+ filtered_input_data = filtered_input_data.filter(if_has_tokens, num_proc=nproc)
49
+ return filtered_input_data
50
+
51
+
52
+ def logging_filtered_data_len(filtered_input_data, filtered_tokens_categ):
53
+ if len(filtered_input_data) == 0:
54
+ logger.error(f"No cells in dataset contain {filtered_tokens_categ}.")
55
+ raise
56
+ else:
57
+ logger.info(f"# cells with {filtered_tokens_categ}: {len(filtered_input_data)}")
58
+
59
+
60
+ def filter_data_by_tokens_and_log(
61
+ filtered_input_data, tokens, nproc, filtered_tokens_categ
62
+ ):
63
+ # filter for cells with anchor gene
64
+ filtered_input_data = filter_data_by_tokens(filtered_input_data, tokens, nproc)
65
+ # logging length of filtered data
66
+ logging_filtered_data_len(filtered_input_data, filtered_tokens_categ)
67
+
68
+ return filtered_input_data
69
+
70
+
71
+ def filter_data_by_start_state(filtered_input_data, cell_states_to_model, nproc):
72
+ # confirm that start state is valid to prevent futile filtering
73
+ state_key = cell_states_to_model["state_key"]
74
+ state_values = filtered_input_data[state_key]
75
+ start_state = cell_states_to_model["start_state"]
76
+ if start_state not in state_values:
77
+ logger.error(
78
+ f"Start state {start_state} is not present "
79
+ f"in the dataset's {state_key} attribute."
80
+ )
81
+ raise
82
+
83
+ # filter for start state cells
84
+ def filter_for_origin(example):
85
+ return example[state_key] in [start_state]
86
+
87
+ filtered_input_data = filtered_input_data.filter(filter_for_origin, num_proc=nproc)
88
+ return filtered_input_data
89
+
90
+
91
+ def slice_by_inds_to_perturb(filtered_input_data, cell_inds_to_perturb):
92
+ if cell_inds_to_perturb["start"] >= len(filtered_input_data):
93
+ logger.error(
94
+ "cell_inds_to_perturb['start'] is larger than the filtered dataset."
95
+ )
96
+ raise
97
+ if cell_inds_to_perturb["end"] > len(filtered_input_data):
98
+ logger.warning(
99
+ "cell_inds_to_perturb['end'] is larger than the filtered dataset. \
100
+ Setting to the end of the filtered dataset."
101
+ )
102
+ cell_inds_to_perturb["end"] = len(filtered_input_data)
103
+ filtered_input_data = filtered_input_data.select(
104
+ [i for i in range(cell_inds_to_perturb["start"], cell_inds_to_perturb["end"])]
105
+ )
106
+ return filtered_input_data
107
+
108
+
109
+ # load model to GPU
110
+ def load_model(model_type, num_classes, model_directory):
111
+ if model_type == "Pretrained":
112
+ model = BertForMaskedLM.from_pretrained(
113
+ model_directory, output_hidden_states=True, output_attentions=False
114
+ )
115
+ elif model_type == "GeneClassifier":
116
+ model = BertForTokenClassification.from_pretrained(
117
+ model_directory,
118
+ num_labels=num_classes,
119
+ output_hidden_states=True,
120
+ output_attentions=False,
121
+ )
122
+ elif model_type == "CellClassifier":
123
+ model = BertForSequenceClassification.from_pretrained(
124
+ model_directory,
125
+ num_labels=num_classes,
126
+ output_hidden_states=True,
127
+ output_attentions=False,
128
+ )
129
+ # put the model in eval mode for fwd pass
130
+ model.eval()
131
+ model = model.to("cuda:0")
132
+ return model
133
+
134
+
135
+ def quant_layers(model):
136
+ layer_nums = []
137
+ for name, parameter in model.named_parameters():
138
+ if "layer" in name:
139
+ layer_nums += [int(name.split("layer.")[1].split(".")[0])]
140
+ return int(max(layer_nums)) + 1
141
+
142
+
143
+ def get_model_input_size(model):
144
+ return int(re.split("\(|,", str(model.bert.embeddings.position_embeddings))[1])
145
+
146
+
147
+ def flatten_list(megalist):
148
+ return [item for sublist in megalist for item in sublist]
149
+
150
+
151
+ def measure_length(example):
152
+ example["length"] = len(example["input_ids"])
153
+ return example
154
+
155
+
156
+ def downsample_and_sort(data, max_ncells):
157
+ num_cells = len(data)
158
+ # if max number of cells is defined, then shuffle and subsample to this max number
159
+ if max_ncells is not None:
160
+ if num_cells > max_ncells:
161
+ data = data.shuffle(seed=42)
162
+ num_cells = max_ncells
163
+ data_subset = data.select([i for i in range(num_cells)])
164
+ # sort dataset with largest cell first to encounter any memory errors earlier
165
+ data_sorted = data_subset.sort("length", reverse=True)
166
+ return data_sorted
167
+
168
+
169
+ def get_possible_states(cell_states_to_model):
170
+ possible_states = []
171
+ for key in ["start_state", "goal_state"]:
172
+ possible_states += [cell_states_to_model[key]]
173
+ possible_states += cell_states_to_model.get("alt_states", [])
174
+ return possible_states
175
+
176
+
177
+ def forward_pass_single_cell(model, example_cell, layer_to_quant):
178
+ example_cell.set_format(type="torch")
179
+ input_data = example_cell["input_ids"]
180
+ with torch.no_grad():
181
+ outputs = model(input_ids=input_data.to("cuda"))
182
+ emb = torch.squeeze(outputs.hidden_states[layer_to_quant])
183
+ del outputs
184
+ return emb
185
+
186
+
187
+ def perturb_emb_by_index(emb, indices):
188
+ mask = torch.ones(emb.numel(), dtype=torch.bool)
189
+ mask[indices] = False
190
+ return emb[mask]
191
+
192
+
193
+ def delete_indices(example):
194
+ indices = example["perturb_index"]
195
+ if any(isinstance(el, list) for el in indices):
196
+ indices = flatten_list(indices)
197
+ for index in sorted(indices, reverse=True):
198
+ del example["input_ids"][index]
199
+
200
+ example["length"] = len(example["input_ids"])
201
+ return example
202
+
203
+
204
+ # for genes_to_perturb = "all" where only genes within cell are overexpressed
205
+ def overexpress_indices(example):
206
+ indices = example["perturb_index"]
207
+ if any(isinstance(el, list) for el in indices):
208
+ indices = flatten_list(indices)
209
+ for index in sorted(indices, reverse=True):
210
+ example["input_ids"].insert(0, example["input_ids"].pop(index))
211
+
212
+ example["length"] = len(example["input_ids"])
213
+ return example
214
+
215
+
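To make the rank value encoding semantics concrete: overexpressing a gene already expressed in the cell simply promotes its token to the top rank, leaving the encoding length unchanged (token IDs below are hypothetical):

    example = {"input_ids": [10, 42, 7, 99], "perturb_index": [[2]]}
    example = overexpress_indices(example)
    example["input_ids"]  # [7, 10, 42, 99] -- token 7 moved to rank 1
    example["length"]     # 4 -- unchanged, unlike deletion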
216
+ # for genes_to_perturb = list of genes to overexpress that are not necessarily expressed in cell
217
+ def overexpress_tokens(example, max_len):
218
+ # -100 indicates tokens to overexpress are not present in rank value encoding
219
+ if example["perturb_index"] != [-100]:
220
+ example = delete_indices(example)
221
+ [
222
+ example["input_ids"].insert(0, token)
223
+ for token in example["tokens_to_perturb"][::-1]
224
+ ]
225
+
226
+ # truncate to max input size, must also truncate original emb to be comparable
227
+ if len(example["input_ids"]) > max_len:
228
+ example["input_ids"] = example["input_ids"][0:max_len]
229
+
230
+ example["length"] = len(example["input_ids"])
231
+ return example
232
+
233
+
234
+ def calc_n_overflow(max_len, example_len, tokens_to_perturb, indices_to_perturb):
235
+ n_to_add = len(tokens_to_perturb) - len(indices_to_perturb)
236
+ n_overflow = example_len + n_to_add - max_len
237
+ return n_overflow
238
+
239
+
240
+ def truncate_by_n_overflow(example):
241
+ new_max_len = example["length"] - example["n_overflow"]
242
+ example["input_ids"] = example["input_ids"][0:new_max_len]
243
+ example["length"] = len(example["input_ids"])
244
+ return example
245
+
246
+
247
+ def remove_indices_from_emb(emb, indices_to_remove, gene_dim):
248
+ # indices_to_remove is list of indices to remove
249
+ indices_to_keep = [
250
+ i for i in range(emb.size()[gene_dim]) if i not in indices_to_remove
251
+ ]
252
+ num_dims = emb.dim()
253
+ emb_slice = [
254
+ slice(None) if dim != gene_dim else indices_to_keep for dim in range(num_dims)
255
+ ]
256
+ sliced_emb = emb[emb_slice]
257
+ return sliced_emb
258
+
259
+
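For example, removing two gene positions from a (cells, genes, hidden) batch shrinks only the gene dimension (shapes illustrative):

    import torch

    emb = torch.randn(4, 6, 8)  # (cells, genes, hidden)
    out = remove_indices_from_emb(emb, [1, 3], gene_dim=1)
    out.shape                   # torch.Size([4, 4, 8])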
260
+ def remove_indices_from_emb_batch(emb_batch, list_of_indices_to_remove, gene_dim):
261
+ output_batch_list = [
262
+ remove_indices_from_emb(emb_batch[i, :, :], idxes, gene_dim - 1)
263
+ for i, idxes in enumerate(list_of_indices_to_remove)
264
+ ]
265
+ # pad to a common length, since overexpressed genes may or may not already be present in the original cell
266
+ batch_max = max([emb.size()[gene_dim - 1] for emb in output_batch_list])
267
+ output_batch_list_padded = [
268
+ pad_xd_tensor(emb, 0.000, batch_max, gene_dim - 1) for emb in output_batch_list
269
+ ]
270
+ return torch.stack(output_batch_list_padded)
271
+
272
+
273
+ # removes perturbed indices
274
+ # need to handle the various cases where a set of genes is overexpressed
275
+ def remove_perturbed_indices_set(
276
+ emb,
277
+ perturb_type: str,
278
+ indices_to_perturb: list[list],
279
+ tokens_to_perturb: list[list],
280
+ original_lengths: list[int],
281
+ input_ids=None,
282
+ ):
283
+ if perturb_type == "overexpress":
284
+ num_perturbed = len(tokens_to_perturb)
285
+ if num_perturbed == 1:
286
+ indices_to_perturb_orig = [
287
+ idx if idx != [-100] else [None] for idx in indices_to_perturb
288
+ ]
289
+ if all(v == [None] for v in indices_to_perturb_orig):
290
+ return emb
291
+ else:
292
+ indices_to_perturb_orig = []
293
+
294
+ for idx_list in indices_to_perturb:
295
+ indices_to_perturb_orig.append(
296
+ [idx if idx != [-100] else [None] for idx in idx_list]
297
+ )
298
+
299
+ else:
300
+ indices_to_perturb_orig = indices_to_perturb
301
+
302
+ emb = remove_indices_from_emb_batch(emb, indices_to_perturb_orig, gene_dim=1)
303
+
304
+ return emb
305
+
306
+
307
+ def make_perturbation_batch(
308
+ example_cell, perturb_type, tokens_to_perturb, anchor_token, combo_lvl, num_proc
309
+ ) -> tuple[Dataset, list[int]]:
310
+ if combo_lvl == 0 and tokens_to_perturb == "all":
311
+ if perturb_type in ["overexpress", "activate"]:
312
+ range_start = 1
313
+ elif perturb_type in ["delete", "inhibit"]:
314
+ range_start = 0
315
+ indices_to_perturb = [
316
+ [i] for i in range(range_start, example_cell["length"][0])
317
+ ]
318
+ # elif combo_lvl > 0 and anchor_token is None:
319
+ ## to implement
320
+ elif combo_lvl > 0 and (anchor_token is not None):
321
+ example_input_ids = example_cell["input_ids"][0]
322
+ anchor_index = example_input_ids.index(anchor_token[0])
323
+ indices_to_perturb = [
324
+ sorted([anchor_index, i]) if i != anchor_index else None
325
+ for i in range(example_cell["length"][0])
326
+ ]
327
+ indices_to_perturb = [item for item in indices_to_perturb if item is not None]
328
+ else:
329
+ example_input_ids = example_cell["input_ids"][0]
330
+ indices_to_perturb = [
331
+ [example_input_ids.index(token)] if token in example_input_ids else None
332
+ for token in tokens_to_perturb
333
+ ]
334
+ indices_to_perturb = [item for item in indices_to_perturb if item is not None]
335
+
336
+ # create all permutations of combo_lvl of modifiers from tokens_to_perturb
337
+ if combo_lvl > 0 and (anchor_token is None):
338
+ if tokens_to_perturb != "all":
339
+ if len(tokens_to_perturb) == combo_lvl + 1:
340
+ indices_to_perturb = [
341
+ list(x) for x in it.combinations(indices_to_perturb, combo_lvl + 1)
342
+ ]
343
+ else:
344
+ all_indices = [[i] for i in range(example_cell["length"][0])]
345
+ all_indices = [
346
+ index for index in all_indices if index not in indices_to_perturb
347
+ ]
348
+ indices_to_perturb = [
349
+ [[j for i in indices_to_perturb for j in i], x] for x in all_indices
350
+ ]
351
+
352
+ length = len(indices_to_perturb)
353
+ perturbation_dataset = Dataset.from_dict(
354
+ {
355
+ "input_ids": example_cell["input_ids"] * length,
356
+ "perturb_index": indices_to_perturb,
357
+ }
358
+ )
359
+
360
+ if length < 400:
361
+ num_proc_i = 1
362
+ else:
363
+ num_proc_i = num_proc
364
+
365
+ if perturb_type == "delete":
366
+ perturbation_dataset = perturbation_dataset.map(
367
+ delete_indices, num_proc=num_proc_i
368
+ )
369
+ elif perturb_type == "overexpress":
370
+ perturbation_dataset = perturbation_dataset.map(
371
+ overexpress_indices, num_proc=num_proc_i
372
+ )
373
+
374
+ perturbation_dataset = perturbation_dataset.map(measure_length, num_proc=num_proc_i)
375
+
376
+ return perturbation_dataset, indices_to_perturb
377
+
378
+
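For intuition on the anchor-gene combination branch above, each perturbation pairs the anchor index with one other position (values illustrative):

    # illustrative: cell of length 4, anchor token at index 1
    # sorted([anchor_index, i]) for each other position i gives
    # indices_to_perturb = [[0, 1], [1, 2], [1, 3]]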
379
+ # perturbed cell emb removing the activated/overexpressed/inhibited gene emb
380
+ # so that only non-perturbed gene embeddings are compared to each other
381
+ # in original or perturbed context
382
+ def make_comparison_batch(original_emb_batch, indices_to_perturb, perturb_group):
383
+ all_embs_list = []
384
+
385
+ # if making comparison batch for multiple perturbations in single cell
386
+ if perturb_group is False:
387
+ # squeeze if single cell
388
+ if original_emb_batch.ndim == 3 and original_emb_batch.size()[0] == 1:
389
+ original_emb_batch = torch.squeeze(original_emb_batch)
390
+ original_emb_list = [original_emb_batch] * len(indices_to_perturb)
391
+ # if making comparison batch for single perturbation in multiple cells
392
+ elif perturb_group is True:
393
+ original_emb_list = original_emb_batch
394
+
395
+ for original_emb, indices in zip(original_emb_list, indices_to_perturb):
396
+ if indices == [-100]:
397
+ all_embs_list += [original_emb[:]]
398
+ continue
399
+
400
+ emb_list = []
401
+ start = 0
402
+ if any(isinstance(el, list) for el in indices):
403
+ indices = flatten_list(indices)
404
+
405
+ # removes indices that were perturbed from the original embedding
406
+ for i in sorted(indices):
407
+ emb_list += [original_emb[start:i]]
408
+ start = i + 1
409
+
410
+ emb_list += [original_emb[start:]]
411
+ all_embs_list += [torch.cat(emb_list)]
412
+
413
+ len_set = set([emb.size()[0] for emb in all_embs_list])
414
+ if len(len_set) > 1:
415
+ max_len = max(len_set)
416
+ all_embs_list = [pad_2d_tensor(emb, None, max_len, 0) for emb in all_embs_list]
417
+ return torch.stack(all_embs_list)
418
+
419
+
420
+ def pad_list(input_ids, pad_token_id, max_len):
421
+ input_ids = np.pad(
422
+ input_ids,
423
+ (0, max_len - len(input_ids)),
424
+ mode="constant",
425
+ constant_values=pad_token_id,
426
+ )
427
+ return input_ids
428
+
429
+
430
+ def pad_xd_tensor(tensor, pad_token_id, max_len, dim):
431
+ padding_length = max_len - tensor.size()[dim]
432
+ # Construct a padding configuration where all padding values are 0, except for the padding dimension
433
+ # 2 * number of dimensions (padding before and after for every dimension)
434
+ pad_config = [0] * 2 * tensor.dim()
435
+ # Set the padding after the desired dimension to the calculated padding length
436
+ pad_config[-2 * dim - 1] = padding_length
437
+ return torch.nn.functional.pad(
438
+ tensor, pad=pad_config, mode="constant", value=pad_token_id
439
+ )
440
+
441
+
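The pad_config arithmetic above relies on torch.nn.functional.pad ordering its pad argument from the last dimension backwards, so index -2 * dim - 1 is the "after" slot for dimension dim. Two quick shape checks (illustrative):

    import torch

    t = torch.ones(2, 3)
    pad_xd_tensor(t, 0, 5, dim=1).shape  # torch.Size([2, 5])
    pad_xd_tensor(t, 0, 4, dim=0).shape  # torch.Size([4, 3])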
442
+ def pad_tensor(tensor, pad_token_id, max_len):
443
+ tensor = torch.nn.functional.pad(
444
+ tensor, pad=(0, max_len - tensor.numel()), mode="constant", value=pad_token_id
445
+ )
446
+
447
+ return tensor
448
+
449
+
450
+ def pad_2d_tensor(tensor, pad_token_id, max_len, dim):
451
+ if dim == 0:
452
+ pad = (0, 0, 0, max_len - tensor.size()[dim])
453
+ elif dim == 1:
454
+ pad = (0, max_len - tensor.size()[dim], 0, 0)
455
+ tensor = torch.nn.functional.pad(
456
+ tensor, pad=pad, mode="constant", value=pad_token_id
457
+ )
458
+ return tensor
459
+
460
+
461
+ def pad_3d_tensor(tensor, pad_token_id, max_len, dim):
462
+ if dim == 0:
463
+ raise Exception("dim 0 usually does not need to be padded.")
464
+ if dim == 1:
465
+ pad = (0, 0, 0, max_len - tensor.size()[dim])
466
+ elif dim == 2:
467
+ pad = (0, max_len - tensor.size()[dim], 0, 0)
468
+ tensor = torch.nn.functional.pad(
469
+ tensor, pad=pad, mode="constant", value=pad_token_id
470
+ )
471
+ return tensor
472
+
473
+
474
+ def pad_or_truncate_encoding(encoding, pad_token_id, max_len):
475
+ if isinstance(encoding, torch.Tensor):
476
+ encoding_len = encoding.size()[0]
477
+ elif isinstance(encoding, list):
478
+ encoding_len = len(encoding)
479
+ if encoding_len > max_len:
480
+ encoding = encoding[0:max_len]
481
+ elif encoding_len < max_len:
482
+ if isinstance(encoding, torch.Tensor):
483
+ encoding = pad_tensor(encoding, pad_token_id, max_len)
484
+ elif isinstance(encoding, list):
485
+ encoding = pad_list(encoding, pad_token_id, max_len)
486
+ return encoding
487
+
488
+
489
+ # pad list of tensors and convert to tensor
490
+ def pad_tensor_list(
491
+ tensor_list,
492
+ dynamic_or_constant,
493
+ pad_token_id,
494
+ model_input_size,
495
+ dim=None,
496
+ padding_func=None,
497
+ ):
498
+ # determine maximum tensor length
499
+ if dynamic_or_constant == "dynamic":
500
+ max_len = max([tensor.squeeze().numel() for tensor in tensor_list])
501
+ elif isinstance(dynamic_or_constant, int):
502
+ max_len = dynamic_or_constant
503
+ else:
504
+ max_len = model_input_size
505
+ logger.warning(
506
+ "If padding style is constant, must provide integer value. "
507
+ f"Setting padding to max input size {model_input_size}."
508
+ )
509
+
510
+ # pad all tensors to maximum length
511
+ if dim is None:
512
+ tensor_list = [
513
+ pad_tensor(tensor, pad_token_id, max_len) for tensor in tensor_list
514
+ ]
515
+ else:
516
+ tensor_list = [
517
+ padding_func(tensor, pad_token_id, max_len, dim) for tensor in tensor_list
518
+ ]
519
+ # return stacked tensors
520
+ if padding_func != pad_3d_tensor:
521
+ return torch.stack(tensor_list)
522
+ else:
523
+ return torch.cat(tensor_list, 0)
524
+
525
+
526
+ def gen_attention_mask(minibatch_encoding, max_len=None):
527
+ if max_len is None:
528
+ max_len = max(minibatch_encoding["length"])
529
+ original_lens = minibatch_encoding["length"]
530
+ attention_mask = [
531
+ [1] * original_len + [0] * (max_len - original_len)
532
+ if original_len <= max_len
533
+ else [1] * max_len
534
+ for original_len in original_lens
535
+ ]
536
+ return torch.tensor(attention_mask, device="cuda")
537
+
538
+
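For example (the mask is created on CUDA, matching the function's hard-coded device, so this assumes a GPU is available):

    minibatch = {"length": [3, 1]}
    gen_attention_mask(minibatch)
    # tensor([[1, 1, 1],
    #         [1, 0, 0]], device='cuda:0')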
539
+ # get cell embeddings excluding padding
540
+ def mean_nonpadding_embs(embs, original_lens, dim=1):
541
+ # create a mask tensor based on padding lengths
542
+ mask = torch.arange(embs.size(dim), device=embs.device) < original_lens.unsqueeze(1)
543
+ if embs.dim() == 3:
544
+ # fill the masked positions in embs with zeros
545
+ masked_embs = embs.masked_fill(~mask.unsqueeze(2), 0.0)
546
+
547
+ # compute the mean across the non-padding dimensions
548
+ mean_embs = masked_embs.sum(dim) / original_lens.view(-1, 1).float()
549
+
550
+ elif embs.dim() == 2:
551
+ masked_embs = embs.masked_fill(~mask, 0.0)
552
+ mean_embs = masked_embs.sum(dim) / original_lens.float()
553
+ return mean_embs
554
+
555
+
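A small sanity check that padded positions are excluded from the mean (values illustrative):

    import torch

    embs = torch.tensor([[[1.0], [3.0], [9.0]]])  # (1 cell, 3 positions, 1 dim)
    lens = torch.tensor([2])                      # only the first 2 positions are real
    mean_nonpadding_embs(embs, lens)              # tensor([[2.]]) -- (1 + 3) / 2; the 9.0 is ignored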
556
+ # get cell embeddings when there is no padding
557
+ def compute_nonpadded_cell_embedding(embs, cell_emb_style):
558
+ if cell_emb_style == "mean_pool":
559
+ return torch.mean(embs, dim=embs.ndim - 2)
560
+
561
+
562
+ # quantify shifts for a set of genes
563
+ def quant_cos_sims(
564
+ perturbation_emb,
565
+ original_emb,
566
+ cell_states_to_model,
567
+ state_embs_dict,
568
+ emb_mode="gene",
569
+ ):
570
+ if emb_mode == "gene":
571
+ cos = torch.nn.CosineSimilarity(dim=2)
572
+ elif emb_mode == "cell":
573
+ cos = torch.nn.CosineSimilarity(dim=1)
574
+
575
+ if cell_states_to_model is None:
576
+ cos_sims = cos(perturbation_emb, original_emb).to("cuda")
577
+ else:
578
+ possible_states = get_possible_states(cell_states_to_model)
579
+ cos_sims = dict(zip(possible_states, [[] for _ in range(len(possible_states))]))
580
+ for state in possible_states:
581
+ cos_sims[state] = cos_sim_shift(
582
+ original_emb,
583
+ perturbation_emb,
584
+ state_embs_dict[state].to("cuda"), # required to move to cuda here
585
+ cos,
586
+ )
587
+
588
+ return cos_sims
589
+
590
+
591
+ # calculate cos sim shift of perturbation with respect to origin and alternative cell
592
+ def cos_sim_shift(original_emb, perturbed_emb, end_emb, cos):
593
+ origin_v_end = cos(original_emb, end_emb)
594
+ perturb_v_end = cos(perturbed_emb, end_emb)
595
+
596
+ return perturb_v_end - origin_v_end
597
+
598
+
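A worked two-dimensional example; a positive value means the perturbation moved the embedding toward the end state:

    import torch

    cos = torch.nn.CosineSimilarity(dim=1)
    origin  = torch.tensor([[1.0, 0.0]])
    perturb = torch.tensor([[1.0, 1.0]])
    goal    = torch.tensor([[0.0, 1.0]])
    cos_sim_shift(origin, perturb, goal, cos)  # tensor([0.7071]) -- shift toward goal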
599
+ def concatenate_cos_sims(cos_sims):
600
+ if isinstance(cos_sims, list):
601
+ return torch.cat(cos_sims)
602
+ else:
603
+ for state in cos_sims.keys():
604
+ cos_sims[state] = torch.cat(cos_sims[state])
605
+ return cos_sims
606
+
607
+
608
+ def write_perturbation_dictionary(cos_sims_dict: defaultdict, output_path_prefix: str):
609
+ with open(f"{output_path_prefix}_raw.pickle", "wb") as fp:
610
+ pickle.dump(cos_sims_dict, fp)
611
+
612
+
613
+ def tensor_list_to_pd(tensor_list):
614
+ tensor = torch.cat(tensor_list).cpu().numpy()
615
+ df = pd.DataFrame(tensor)
616
+ return df
617
+
618
+
619
+ def validate_cell_states_to_model(cell_states_to_model):
620
+ if cell_states_to_model is not None:
621
+ if len(cell_states_to_model.items()) == 1:
622
+ logger.warning(
623
+ "The single value dictionary for cell_states_to_model will be "
624
+ "replaced with a dictionary with named keys for start, goal, and alternate states. "
625
+ "Please specify state_key, start_state, goal_state, and alt_states "
626
+ "in the cell_states_to_model dictionary for future use. "
627
+ "For example, cell_states_to_model={"
628
+ "'state_key': 'disease', "
629
+ "'start_state': 'dcm', "
630
+ "'goal_state': 'nf', "
631
+ "'alt_states': ['hcm', 'other1', 'other2']}"
632
+ )
633
+ for key, value in cell_states_to_model.items():
634
+ if (len(value) == 3) and isinstance(value, tuple):
635
+ if (
636
+ isinstance(value[0], list)
637
+ and isinstance(value[1], list)
638
+ and isinstance(value[2], list)
639
+ ):
640
+ if len(value[0]) == 1 and len(value[1]) == 1:
641
+ all_values = value[0] + value[1] + value[2]
642
+ if len(all_values) == len(set(all_values)):
643
+ continue
644
+ # reformat to the new named key format
645
+ state_values = flatten_list(list(cell_states_to_model.values()))
646
+
647
+ cell_states_to_model = {
648
+ "state_key": list(cell_states_to_model.keys())[0],
649
+ "start_state": state_values[0][0],
650
+ "goal_state": state_values[1][0],
651
+ "alt_states": state_values[2:][0],
652
+ }
653
+ elif set(cell_states_to_model.keys()).issuperset(
654
+ {"state_key", "start_state", "goal_state"}
655
+ ):
656
+ if (
657
+ (cell_states_to_model["state_key"] is None)
658
+ or (cell_states_to_model["start_state"] is None)
659
+ or (cell_states_to_model["goal_state"] is None)
660
+ ):
661
+ logger.error(
662
+ "Please specify 'state_key', 'start_state', and 'goal_state' in cell_states_to_model."
663
+ )
664
+ raise
665
+
666
+ if (
667
+ cell_states_to_model["start_state"]
668
+ == cell_states_to_model["goal_state"]
669
+ ):
670
+ logger.error("All states must be unique.")
671
+ raise
672
+
673
+ if "alt_states" in set(cell_states_to_model.keys()):
674
+ if cell_states_to_model["alt_states"] is not None:
675
+ if not isinstance(cell_states_to_model["alt_states"], list):
676
+ logger.error(
677
+ "cell_states_to_model['alt_states'] must be a list (even if it is one element)."
678
+ )
679
+ raise
680
+ if len(cell_states_to_model["alt_states"]) != len(
681
+ set(cell_states_to_model["alt_states"])
682
+ ):
683
+ logger.error("All states must be unique.")
684
+ raise
685
+ else:
686
+ cell_states_to_model["alt_states"] = []
687
+
688
+ else:
689
+ logger.error(
690
+ "cell_states_to_model must only have the following four keys: "
691
+ "'state_key', 'start_state', 'goal_state', 'alt_states'."
692
+ "For example, cell_states_to_model={"
693
+ "'state_key': 'disease', "
694
+ "'start_state': 'dcm', "
695
+ "'goal_state': 'nf', "
696
+ "'alt_states': ['hcm', 'other1', 'other2']}"
697
+ )
698
+ raise
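As a concrete illustration of the deprecated-format branch above (state names taken from the warning message):

    # old single-key tuple format
    {"disease": (["dcm"], ["nf"], ["hcm"])}
    # is reformatted by the branch above into the named-key format
    {"state_key": "disease", "start_state": "dcm", "goal_state": "nf", "alt_states": ["hcm"]}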
setup.py CHANGED
@@ -2,7 +2,7 @@ from setuptools import setup
2
 
3
  setup(
4
  name="geneformer",
5
- version="0.0.1",
6
  author="Christina Theodoris",
7
  author_email="christina.theodoris@gladstone.ucsf.edu",
8
  description="Geneformer is a transformer model pretrained \
2
 
3
  setup(
4
  name="geneformer",
5
+ version="0.1.0",
6
  author="Christina Theodoris",
7
  author_email="christina.theodoris@gladstone.ucsf.edu",
8
  description="Geneformer is a transformer model pretrained \