Fix isp perturb_group dims, reformat cell states dict to keyed, add attn mask

Browse files

Files changed (5) hide show

examples/extract_and_plot_cell_embeddings.ipynb +0 -0
examples/in_silico_perturbation.ipynb +4 -1
geneformer/emb_extractor.py +8 -22
geneformer/in_silico_perturber.py +114 -67
geneformer/in_silico_perturber_stats.py +80 -17

examples/extract_and_plot_cell_embeddings.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

examples/in_silico_perturbation.ipynb CHANGED Viewed

@@ -33,7 +33,10 @@
     "                        emb_mode=\"cell\",\n",
     "                        cell_emb_style=\"mean_pool\",\n",
     "                        filter_data={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]},\n",
-    "                        cell_states_to_model={\"disease\":([\"dcm\"],[\"nf\"],[\"hcm\"])},\n",
     "                        max_ncells=2000,\n",
     "                        emb_layer=0,\n",
     "                        forward_batch_size=400,\n",

     "                        emb_mode=\"cell\",\n",
     "                        cell_emb_style=\"mean_pool\",\n",
     "                        filter_data={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]},\n",
+    "                        cell_states_to_model={'state_key': 'disease', \n",
+    "                                              'start_state': 'dcm', \n",
+    "                                              'goal_state': 'nf', \n",
+    "                                              'alt_states': ['hcm']},\n",
     "                        max_ncells=2000,\n",
     "                        emb_layer=0,\n",
     "                        forward_batch_size=400,\n",

geneformer/emb_extractor.py CHANGED Viewed

@@ -43,32 +43,17 @@ from transformers import BertForMaskedLM, BertForTokenClassification, BertForSeq
 from .tokenizer import TOKEN_DICTIONARY_FILE
-from .in_silico_perturber import load_and_filter, \
-                                 downsample_and_sort, \
                                  load_model, \
-                                 quant_layers, \
-                                 downsample_and_sort, \
                                  pad_tensor_list, \
-                                 get_model_input_size
 logger = logging.getLogger(__name__)
-# get cell embeddings excluding padding
-def mean_nonpadding_embs(embs, original_lens):
-    # mask based on padding lengths
-    mask = torch.arange(embs.size(1)).unsqueeze(0).to("cuda") < original_lens.unsqueeze(1)
-    # extend mask dimensions to match the embeddings tensor
-    mask = mask.unsqueeze(2).expand_as(embs)
-    # use the mask to zero out the embeddings in padded areas
-    masked_embs = embs * mask.float()
-    # sum and divide by the lengths to get the mean of non-padding embs
-    mean_embs = masked_embs.sum(1) / original_lens.view(-1, 1).float()
-    return mean_embs
 # average embedding position of goal cell states
 def get_embs(model,
              filtered_input_data,
@@ -99,7 +84,8 @@ def get_embs(model,
         with torch.no_grad():
             outputs = model(
-                input_ids = input_data_minibatch.to("cuda")
             )
         embs_i = outputs.hidden_states[layer_to_quant]

 from .tokenizer import TOKEN_DICTIONARY_FILE
+from .in_silico_perturber import downsample_and_sort, \
+                                 gen_attention_mask, \
+                                 get_model_input_size, \
+                                 load_and_filter, \
                                  load_model, \
+                                 mean_nonpadding_embs, \
                                  pad_tensor_list, \
+                                 quant_layers
 logger = logging.getLogger(__name__)
 # average embedding position of goal cell states
 def get_embs(model,
              filtered_input_data,
         with torch.no_grad():
             outputs = model(
+                input_ids = input_data_minibatch.to("cuda"),
+                attention_mask = gen_attention_mask(minibatch)
             )
         embs_i = outputs.hidden_states[layer_to_quant]

geneformer/in_silico_perturber.py CHANGED Viewed

@@ -13,7 +13,7 @@ Usage:
                           emb_mode="cell",
                           cell_emb_style="mean_pool",
                           filter_data={"cell_type":["cardiomyocyte"]},
-                          cell_states_to_model={"disease":(["dcm"],["ctrl"],["hcm"])},
                           max_ncells=None,
                           emb_layer=-1,
                           forward_batch_size=100,
@@ -106,10 +106,11 @@ def downsample_and_sort(data_shuffled, max_ncells):
     return data_sorted
 def get_possible_states(cell_states_to_model):
-    if list(cell_states_to_model.values())[3] is not None:
-        return list(cell_states_to_model.values())[1:3] + list(cell_states_to_model.values())[3]
-    else:
-        return list(cell_states_to_model.values())[1:3]
 def forward_pass_single_cell(model, example_cell, layer_to_quant):
     example_cell.set_format(type="torch")
@@ -152,6 +153,21 @@ def overexpress_tokens(example):
     [example["input_ids"].insert(0, token) for token in example["tokens_to_perturb"][::-1]]
     return example
 def make_perturbation_batch(example_cell,
                             perturb_type,
                             tokens_to_perturb,
@@ -249,7 +265,7 @@ def get_cell_state_avg_embs(model,
         def filter_states(example):
             state_key = cell_states_to_model["state_key"]
-            return example[state_key] in possible_state
         filtered_input_data_state = filtered_input_data.filter(filter_states, num_proc=num_proc)
         total_batch_length = len(filtered_input_data_state)
         if ((total_batch_length-1)/forward_batch_size).is_integer():
@@ -262,15 +278,17 @@ def get_cell_state_avg_embs(model,
             state_minibatch.set_format(type="torch")
             input_data_minibatch = state_minibatch["input_ids"]
-            original_lens += [tensor.numel() for tensor in input_data_minibatch]
             input_data_minibatch = pad_tensor_list(input_data_minibatch,
                                                    max_len,
                                                    pad_token_id,
                                                    model_input_size)
             with torch.no_grad():
                 outputs = model(
-                    input_ids = input_data_minibatch.to("cuda")
                 )
             state_embs_i = outputs.hidden_states[layer_to_quant]
@@ -278,11 +296,10 @@ def get_cell_state_avg_embs(model,
             del outputs
             del state_minibatch
             del input_data_minibatch
             del state_embs_i
             torch.cuda.empty_cache()
-        # import here to avoid circular imports
-        from .emb_extractor import mean_nonpadding_embs
         state_embs = torch.cat(state_embs_list)
         avg_state_emb = mean_nonpadding_embs(state_embs, torch.Tensor(original_lens).to("cuda"))
         avg_state_emb = torch.mean(avg_state_emb, dim=0, keepdim=True)
@@ -333,6 +350,7 @@ def quant_cos_sims(model,
             needs_pad_or_trunc = True
         else:
             needs_pad_or_trunc = False
         if needs_pad_or_trunc == True:
             max_len = min(max(minibatch_length_set),model_input_size)
@@ -345,14 +363,17 @@ def quant_cos_sims(model,
         perturbation_minibatch.set_format(type="torch")
         input_data_minibatch = perturbation_minibatch["input_ids"]
         # extract embeddings for perturbation minibatch
         with torch.no_grad():
             outputs = model(
-                input_ids = input_data_minibatch.to("cuda")
             )
         del input_data_minibatch
         del perturbation_minibatch
         if len(indices_to_perturb)>1:
             minibatch_emb = torch.squeeze(outputs.hidden_states[layer_to_quant])
@@ -387,43 +408,29 @@ def quant_cos_sims(model,
                 original_minibatch = original_minibatch.map(pad_or_trunc_example, num_proc=nproc)
             original_minibatch.set_format(type="torch")
             original_input_data_minibatch = original_minibatch["input_ids"]
             # extract embeddings for original minibatch
             with torch.no_grad():
                 original_outputs = model(
-                    input_ids = original_input_data_minibatch.to("cuda")
                 )
             del original_input_data_minibatch
             del original_minibatch
             if len(indices_to_perturb)>1:
                 original_minibatch_emb = torch.squeeze(original_outputs.hidden_states[layer_to_quant])
             else:
                 original_minibatch_emb = original_outputs.hidden_states[layer_to_quant]
-            # remove perturbed index before calculating the cos sims
-            def remove_indices_from_emb(emb, indices_to_remove, gene_dim):
-                # indices_to_remove is list of indices to remove
-                gene_dim -= 1  # removing a dim in calling the function
-                indices_to_keep = [i for i in range(emb.size()[gene_dim]) if i not in indices_to_remove]
-                num_dims = emb.dim()
-                emb_slice = [slice(None) if dim != gene_dim else indices_to_keep for dim in range(num_dims)]
-                sliced_emb = emb[emb_slice]
-                return sliced_emb
-            # this could probably be optimized
             gene_dim = 1
-            # current there's the case if a gene is not expressed and is being overexpressed,
-            # the dimensions will be thrown off --> not removing indices to get around that issue
-            # not sure what's the best way to handle it
             if perturb_type != "overexpress":
-                original_minibatch_emb = torch.stack([
-                    remove_indices_from_emb(original_minibatch_emb[i, :, :], idx, gene_dim) for
-                    i, idx in enumerate(indices_to_perturb)
-                ])
-            # do the averaging here
         # cosine similarity between original emb and batch items
         if cell_states_to_model is None:
@@ -433,6 +440,7 @@ def quant_cos_sims(model,
                 minibatch_comparison = make_comparison_batch(original_minibatch_emb,
                                                              indices_to_perturb,
                                                              perturb_group)
             cos_sims += [cos(minibatch_emb, minibatch_comparison).to("cpu")]
         elif cell_states_to_model is not None:
             for state in possible_states:
@@ -462,12 +470,17 @@ def quant_cos_sims(model,
         return cos_sims_vs_alt_dict
 # calculate cos sim shift of perturbation with respect to origin and alternative cell
-def cos_sim_shift(original_emb, minibatch_emb, alt_emb, perturb_group, original_minibatch_lengths = None, minibatch_lengths = None,):
     cos = torch.nn.CosineSimilarity(dim=2)
     if not perturb_group:
         original_emb = torch.mean(original_emb,dim=0,keepdim=True)
         original_emb = original_emb[None, :]
-        origin_v_end = torch.squeeze(cos(original_emb, alt_emb))
     else:
         if original_emb.size() != minibatch_emb.size():
             logger.error(
@@ -476,26 +489,22 @@ def cos_sim_shift(original_emb, minibatch_emb, alt_emb, perturb_group, original_
                 f"minibatch_emb is {minibatch_emb.size()}. "
             )
             raise
-        from .emb_extractor import mean_nonpadding_embs
         if original_minibatch_lengths is not None:
             original_emb = mean_nonpadding_embs(original_emb, original_minibatch_lengths)
-            # not sure if the else is necessary, but keeping it here in case
-        else:
-            original_emb = torch.mean(original_emb,dim=1,keepdim=True)
-        alt_emb = torch.unsqueeze(alt_emb, 1)
-        origin_v_end = cos(original_emb, alt_emb)
         origin_v_end = torch.squeeze(origin_v_end)
     if minibatch_lengths is not None:
         perturb_emb = mean_nonpadding_embs(minibatch_emb, minibatch_lengths)
     else:
         perturb_emb = torch.mean(minibatch_emb,dim=1,keepdim=True)
-    perturb_v_end = cos(perturb_emb, alt_emb)
     perturb_v_end = torch.squeeze(perturb_v_end)
     return [(perturb_v_end-origin_v_end).to("cpu")]
 def pad_list(input_ids, pad_token_id, max_len):
@@ -555,6 +564,30 @@ def pad_tensor_list(tensor_list, dynamic_or_constant, pad_token_id, model_input_
     # return stacked tensors
     return torch.stack(tensor_list)
 class InSilicoPerturber:
     valid_option_dict = {
         "perturb_type": {"delete","overexpress","inhibit","activate"},
@@ -640,9 +673,15 @@ class InSilicoPerturber:
             Otherwise, dictionary specifying .dataset column name and list of values to filter by.
         cell_states_to_model: None, dict
             Cell states to model if testing perturbations that achieve goal state change.
-            Single-item dictionary with key being cell attribute (e.g. "disease").
-            Value is tuple of three lists indicating start state, goal end state, and alternate possible end states.
-            If no alternate possible end states, third list should be empty (i.e. the third list should be []).
         max_ncells : None, int
             Maximum number of cells to test.
             If None, will test all cells.
@@ -775,9 +814,14 @@ class InSilicoPerturber:
             if len(self.cell_states_to_model.items()) == 1:
                 logger.warning(
                     "The single value dictionary for cell_states_to_model will be " \
-                    "replaced with explicitly modeling start and end states. " \
-                    "Please specify state_key, start_state, end_state, and alt_states " \
-                    "in the cell_states_to_model dictionary for future use."
                 )
                 for key,value in self.cell_states_to_model.items():
                     if (len(value) == 3) and isinstance(value, tuple):
@@ -786,7 +830,7 @@ class InSilicoPerturber:
                                 all_values = value[0]+value[1]+value[2]
                                 if len(all_values) == len(set(all_values)):
                                     continue
-                # reformat to the new format
                 state_values = flatten_list(list(self.cell_states_to_model.values()))
                 self.cell_states_to_model = {
                     "state_key": list(self.cell_states_to_model.keys())[0],
@@ -795,11 +839,13 @@ class InSilicoPerturber:
                     "alt_states": state_values[2:][0]
                 }
             elif set(self.cell_states_to_model.keys()) == {"state_key", "start_state", "goal_state", "alt_states"}:
-                if self.cell_states_to_model["start_state"] is None or self.cell_states_to_model["goal_state"] is None:
                     logger.error(
-                        "Please specify 'start_state' and 'goal_state' in cell_states_to_model.")
                     raise
                 if self.cell_states_to_model["start_state"] == self.cell_states_to_model["goal_state"]:
                     logger.error(
                         "All states must be unique.")
@@ -818,13 +864,13 @@ class InSilicoPerturber:
             else:
                 logger.error(
-                    "states_to_model must only have the following four keys: 'state_key', 'start_state', 'goal_state', 'alt_states'." \
-                    "For example, cell_states_to_model={ \
-                            'state_key': 'disease', \
-                            'start_state': 'dcm', \
-                            'goal_state': 'nf'', \
-                            'alt_states': ['hcm', 'other1', 'other2'] \
-                          }"
                 )
                 raise
@@ -877,12 +923,13 @@ class InSilicoPerturber:
         if self.cell_states_to_model is None:
             state_embs_dict = None
         else:
-            # make sure that all states are valid; save time on filtering
             state_name = self.cell_states_to_model["state_key"]
             for value in get_possible_states(self.cell_states_to_model):
-                if value not in filtered_input_data[state_name]:
                     logger.error(
-                        f"{value} is not a valid value in {state_name}.")
                     raise
             # get dictionary of average cell state embeddings for comparison
             downsampled_data = downsample_and_sort(filtered_input_data, self.max_ncells)
@@ -1019,7 +1066,7 @@ class InSilicoPerturber:
                     data_list = []
                     for data in list(cos_sims_data.values()):
                         data_item = data.to("cuda")
-                        data_list += [data_item]
                     cos_sims_dict[(perturbed_genes, "cell_emb")] += [tuple(data_list)]
             with open(f"{output_path_prefix}_raw.pickle", "wb") as fp:
@@ -1213,4 +1260,4 @@ class InSilicoPerturber:
             # save remainder cells
             with open(f"{output_path_prefix}{pickle_batch}_raw.pickle", "wb") as fp:
-                pickle.dump(cos_sims_dict, fp)

                           emb_mode="cell",
                           cell_emb_style="mean_pool",
                           filter_data={"cell_type":["cardiomyocyte"]},
+                          cell_states_to_model={"state_key": "disease", "start_state": "dcm", "goal_state": "nf", "alt_states": ["hcm", "other1", "other2"]},
                           max_ncells=None,
                           emb_layer=-1,
                           forward_batch_size=100,
     return data_sorted
 def get_possible_states(cell_states_to_model):
+    possible_states = []
+    for key in ["start_state","goal_state"]:
+        possible_states += [cell_states_to_model[key]]
+    possible_states += cell_states_to_model.get("alt_states",[])
+    return possible_states
 def forward_pass_single_cell(model, example_cell, layer_to_quant):
     example_cell.set_format(type="torch")
     [example["input_ids"].insert(0, token) for token in example["tokens_to_perturb"][::-1]]
     return example
+def remove_indices_from_emb(emb, indices_to_remove, gene_dim):
+    # indices_to_remove is list of indices to remove
+    indices_to_keep = [i for i in range(emb.size()[gene_dim]) if i not in indices_to_remove]
+    num_dims = emb.dim()
+    emb_slice = [slice(None) if dim != gene_dim else indices_to_keep for dim in range(num_dims)]
+    sliced_emb = emb[emb_slice]
+    return sliced_emb
+def remove_indices_from_emb_batch(emb_batch, list_of_indices_to_remove, gene_dim):
+    output_batch = torch.stack([
+                    remove_indices_from_emb(emb_batch[i, :, :], idx, gene_dim-1) for
+                    i, idx in enumerate(list_of_indices_to_remove)
+                    ])
+    return output_batch
 def make_perturbation_batch(example_cell,
                             perturb_type,
                             tokens_to_perturb,
         def filter_states(example):
             state_key = cell_states_to_model["state_key"]
+            return example[state_key] in [possible_state]
         filtered_input_data_state = filtered_input_data.filter(filter_states, num_proc=num_proc)
         total_batch_length = len(filtered_input_data_state)
         if ((total_batch_length-1)/forward_batch_size).is_integer():
             state_minibatch.set_format(type="torch")
             input_data_minibatch = state_minibatch["input_ids"]
+            original_lens += state_minibatch["length"]
             input_data_minibatch = pad_tensor_list(input_data_minibatch,
                                                    max_len,
                                                    pad_token_id,
                                                    model_input_size)
+            attention_mask = gen_attention_mask(state_minibatch, max_len)
             with torch.no_grad():
                 outputs = model(
+                    input_ids = input_data_minibatch.to("cuda"),
+                    attention_mask = attention_mask
                 )
             state_embs_i = outputs.hidden_states[layer_to_quant]
             del outputs
             del state_minibatch
             del input_data_minibatch
+            del attention_mask
             del state_embs_i
             torch.cuda.empty_cache()
         state_embs = torch.cat(state_embs_list)
         avg_state_emb = mean_nonpadding_embs(state_embs, torch.Tensor(original_lens).to("cuda"))
         avg_state_emb = torch.mean(avg_state_emb, dim=0, keepdim=True)
             needs_pad_or_trunc = True
         else:
             needs_pad_or_trunc = False
+            max_len = max(minibatch_length_set)
         if needs_pad_or_trunc == True:
             max_len = min(max(minibatch_length_set),model_input_size)
         perturbation_minibatch.set_format(type="torch")
         input_data_minibatch = perturbation_minibatch["input_ids"]
+        attention_mask = gen_attention_mask(perturbation_minibatch, max_len)
         # extract embeddings for perturbation minibatch
         with torch.no_grad():
             outputs = model(
+                input_ids = input_data_minibatch.to("cuda"),
+                attention_mask = attention_mask
             )
         del input_data_minibatch
         del perturbation_minibatch
+        del attention_mask
         if len(indices_to_perturb)>1:
             minibatch_emb = torch.squeeze(outputs.hidden_states[layer_to_quant])
                 original_minibatch = original_minibatch.map(pad_or_trunc_example, num_proc=nproc)
             original_minibatch.set_format(type="torch")
             original_input_data_minibatch = original_minibatch["input_ids"]
+            attention_mask = gen_attention_mask(original_minibatch, original_max_len)
             # extract embeddings for original minibatch
             with torch.no_grad():
                 original_outputs = model(
+                    input_ids = original_input_data_minibatch.to("cuda"),
+                    attention_mask = attention_mask
                 )
             del original_input_data_minibatch
             del original_minibatch
+            del attention_mask
             if len(indices_to_perturb)>1:
                 original_minibatch_emb = torch.squeeze(original_outputs.hidden_states[layer_to_quant])
             else:
                 original_minibatch_emb = original_outputs.hidden_states[layer_to_quant]
+            # embedding dimension of the genes
             gene_dim = 1
+            # exclude overexpression due to case when genes are not expressed but being overexpressed
             if perturb_type != "overexpress":
+                original_minibatch_emb = remove_indices_from_emb_batch(original_minibatch_emb,
+                                                                       indices_to_perturb,
+                                                                       gene_dim)
         # cosine similarity between original emb and batch items
         if cell_states_to_model is None:
                 minibatch_comparison = make_comparison_batch(original_minibatch_emb,
                                                              indices_to_perturb,
                                                              perturb_group)
             cos_sims += [cos(minibatch_emb, minibatch_comparison).to("cpu")]
         elif cell_states_to_model is not None:
             for state in possible_states:
         return cos_sims_vs_alt_dict
 # calculate cos sim shift of perturbation with respect to origin and alternative cell
+def cos_sim_shift(original_emb,
+                  minibatch_emb,
+                  end_emb,
+                  perturb_group,
+                  original_minibatch_lengths = None,
+                  minibatch_lengths = None):
     cos = torch.nn.CosineSimilarity(dim=2)
     if not perturb_group:
         original_emb = torch.mean(original_emb,dim=0,keepdim=True)
         original_emb = original_emb[None, :]
+        origin_v_end = torch.squeeze(cos(original_emb, end_emb)) #test
     else:
         if original_emb.size() != minibatch_emb.size():
             logger.error(
                 f"minibatch_emb is {minibatch_emb.size()}. "
             )
             raise
         if original_minibatch_lengths is not None:
             original_emb = mean_nonpadding_embs(original_emb, original_minibatch_lengths)
+        # else:
+        #     original_emb = torch.mean(original_emb,dim=1,keepdim=True)
+        end_emb = torch.unsqueeze(end_emb, 1)
+        origin_v_end = cos(original_emb, end_emb)
         origin_v_end = torch.squeeze(origin_v_end)
     if minibatch_lengths is not None:
         perturb_emb = mean_nonpadding_embs(minibatch_emb, minibatch_lengths)
     else:
         perturb_emb = torch.mean(minibatch_emb,dim=1,keepdim=True)
+    perturb_v_end = cos(perturb_emb, end_emb)
     perturb_v_end = torch.squeeze(perturb_v_end)
     return [(perturb_v_end-origin_v_end).to("cpu")]
 def pad_list(input_ids, pad_token_id, max_len):
     # return stacked tensors
     return torch.stack(tensor_list)
+def gen_attention_mask(minibatch_encoding, max_len = None):
+    if max_len == None:
+        max_len = max(minibatch_encoding["length"])
+    original_lens = minibatch_encoding["length"]
+    attention_mask = [[1]*original_len
+                      +[0]*(max_len - original_len)
+                      for original_len in original_lens]
+    return torch.tensor(attention_mask).to("cuda")
+# get cell embeddings excluding padding
+def mean_nonpadding_embs(embs, original_lens):
+    # mask based on padding lengths
+    mask = torch.arange(embs.size(1)).unsqueeze(0).to("cuda") < original_lens.unsqueeze(1)
+    # extend mask dimensions to match the embeddings tensor
+    mask = mask.unsqueeze(2).expand_as(embs)
+    # use the mask to zero out the embeddings in padded areas
+    masked_embs = embs * mask.float()
+    # sum and divide by the lengths to get the mean of non-padding embs
+    mean_embs = masked_embs.sum(1) / original_lens.view(-1, 1).float()
+    return mean_embs
 class InSilicoPerturber:
     valid_option_dict = {
         "perturb_type": {"delete","overexpress","inhibit","activate"},
             Otherwise, dictionary specifying .dataset column name and list of values to filter by.
         cell_states_to_model: None, dict
             Cell states to model if testing perturbations that achieve goal state change.
+            Four-item dictionary with keys: state_key, start_state, goal_state, and alt_states
+            state_key: key specifying name of column in .dataset that defines the start/goal states
+            start_state: value in the state_key column that specifies the start state
+            goal_state: value in the state_key column taht specifies the goal end state
+            alt_states: list of values in the state_key column that specify the alternate end states
+            For example: {"state_key": "disease",
+                          "start_state": "dcm",
+                          "goal_state": "nf",
+                          "alt_states": ["hcm", "other1", "other2"]}
         max_ncells : None, int
             Maximum number of cells to test.
             If None, will test all cells.
             if len(self.cell_states_to_model.items()) == 1:
                 logger.warning(
                     "The single value dictionary for cell_states_to_model will be " \
+                    "replaced with a dictionary with named keys for start, goal, and alternate states. " \
+                    "Please specify state_key, start_state, goal_state, and alt_states " \
+                    "in the cell_states_to_model dictionary for future use. " \
+                    "For example, cell_states_to_model={" \
+                            "'state_key': 'disease', " \
+                            "'start_state': 'dcm', " \
+                            "'goal_state': 'nf', " \
+                            "'alt_states': ['hcm', 'other1', 'other2']}"
                 )
                 for key,value in self.cell_states_to_model.items():
                     if (len(value) == 3) and isinstance(value, tuple):
                                 all_values = value[0]+value[1]+value[2]
                                 if len(all_values) == len(set(all_values)):
                                     continue
+                # reformat to the new named key format
                 state_values = flatten_list(list(self.cell_states_to_model.values()))
                 self.cell_states_to_model = {
                     "state_key": list(self.cell_states_to_model.keys())[0],
                     "alt_states": state_values[2:][0]
                 }
             elif set(self.cell_states_to_model.keys()) == {"state_key", "start_state", "goal_state", "alt_states"}:
+                if (self.cell_states_to_model["state_key"] is None) \
+                    or (self.cell_states_to_model["start_state"] is None) \
+                    or (self.cell_states_to_model["goal_state"] is None):
                     logger.error(
+                        "Please specify 'state_key', 'start_state', and 'goal_state' in cell_states_to_model.")
                     raise
                 if self.cell_states_to_model["start_state"] == self.cell_states_to_model["goal_state"]:
                     logger.error(
                         "All states must be unique.")
             else:
                 logger.error(
+                    "cell_states_to_model must only have the following four keys: " \
+                    "'state_key', 'start_state', 'goal_state', 'alt_states'." \
+                    "For example, cell_states_to_model={" \
+                            "'state_key': 'disease', " \
+                            "'start_state': 'dcm', " \
+                            "'goal_state': 'nf', " \
+                            "'alt_states': ['hcm', 'other1', 'other2']}"
                 )
                 raise
         if self.cell_states_to_model is None:
             state_embs_dict = None
         else:
+            # confirm that all states are valid to prevent futile filtering
             state_name = self.cell_states_to_model["state_key"]
+            state_values = filtered_input_data[state_name]
             for value in get_possible_states(self.cell_states_to_model):
+                if value not in state_values:
                     logger.error(
+                        f"{value} is not present in the dataset's {state_name} attribute.")
                     raise
             # get dictionary of average cell state embeddings for comparison
             downsampled_data = downsample_and_sort(filtered_input_data, self.max_ncells)
                     data_list = []
                     for data in list(cos_sims_data.values()):
                         data_item = data.to("cuda")
+                        data_list += [data_item[j].item()]
                     cos_sims_dict[(perturbed_genes, "cell_emb")] += [tuple(data_list)]
             with open(f"{output_path_prefix}_raw.pickle", "wb") as fp:
             # save remainder cells
             with open(f"{output_path_prefix}{pickle_batch}_raw.pickle", "wb") as fp:
+                pickle.dump(cos_sims_dict, fp)

geneformer/in_silico_perturber_stats.py CHANGED Viewed

@@ -6,7 +6,10 @@ Usage:
   ispstats = InSilicoPerturberStats(mode="goal_state_shift",
                                     combos=0,
                                     anchor_gene=None,
-                                    cell_states_to_model={"disease":(["dcm"],["ctrl"],["hcm"])})
   ispstats.get_stats("path/to/input_data",
                      None,
                      "path/to/output_directory",
@@ -26,6 +29,8 @@ from scipy.stats import ranksums
 from sklearn.mixture import GaussianMixture
 from tqdm.notebook import trange, tqdm
 from .tokenizer import TOKEN_DICTIONARY_FILE
 GENE_NAME_ID_DICTIONARY_FILE = Path(__file__).parent / "gene_name_id_dict.pkl"
@@ -123,10 +128,10 @@ def isp_aggregate_grouped_perturb(cos_sims_df, dict_list):
 # stats comparing cos sim shifts towards goal state of test perturbations vs random perturbations
 def isp_stats_to_goal_state(cos_sims_df, dict_list, cell_states_to_model, genes_perturbed):
-    cell_state_key = list(cell_states_to_model.keys())[0]
-    if cell_states_to_model[cell_state_key][2] == []:
         alt_end_state_exists = False
-    elif (len(cell_states_to_model[cell_state_key][2]) > 0) and (cell_states_to_model[cell_state_key][2] != [None]):
         alt_end_state_exists = True
     # for single perturbation in multiple cells, there are no random perturbations to compare to
@@ -231,10 +236,12 @@ def isp_stats_to_goal_state(cos_sims_df, dict_list, cell_states_to_model, genes_
         # quantify number of detections of each gene
         cos_sims_full_df["N_Detections"] = [n_detections(i, dict_list, "cell", None) for i in cos_sims_full_df["Gene"]]
-        # sort by shift to desired state
-        cos_sims_full_df = cos_sims_full_df.sort_values(by=["Shift_to_goal_end",
                                                             "Goal_end_FDR"],
-                                                            ascending=[False,True])
         return cos_sims_full_df
@@ -272,9 +279,11 @@ def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
     cos_sims_full_df["Test_vs_null_FDR"] = get_fdr(cos_sims_full_df["Test_vs_null_pval"])
-    cos_sims_full_df = cos_sims_full_df.sort_values(by=["Test_vs_null_avg_shift",
                                                         "Test_vs_null_FDR"],
-                                                        ascending=[False,True])
     return cos_sims_full_df
 # stats for identifying perturbations with largest effect within a given set of cells
@@ -441,9 +450,15 @@ class InSilicoPerturberStats:
                 analyzes data for the effect of anchor gene's perturbation on the embedding of each other gene.
         cell_states_to_model: None, dict
             Cell states to model if testing perturbations that achieve goal state change.
-            Single-item dictionary with key being cell attribute (e.g. "disease").
-            Value is tuple of three lists indicating start state, goal end state, and alternate possible end states.
-            If no alternate possible end states, third list should be empty (i.e. the third list should be []).
         token_dictionary_file : Path
             Path to pickle file containing token dictionary (Ensembl ID:token).
         gene_name_id_dictionary_file : Path
@@ -494,6 +509,17 @@ class InSilicoPerturberStats:
         if self.cell_states_to_model is not None:
             if len(self.cell_states_to_model.items()) == 1:
                 for key,value in self.cell_states_to_model.items():
                     if (len(value) == 3) and isinstance(value, tuple):
                         if isinstance(value[0],list) and isinstance(value[1],list) and isinstance(value[2],list):
@@ -501,14 +527,50 @@ class InSilicoPerturberStats:
                                 all_values = value[0]+value[1]+value[2]
                                 if len(all_values) == len(set(all_values)):
                                     continue
             else:
                 logger.error(
-                    "Cell states to model must be a single-item dictionary with " \
-                    "key being cell attribute (e.g. 'disease') and value being " \
-                    "tuple of three lists indicating start state, goal end state, and alternate possible end states. " \
-                    "Values should all be unique. " \
-                    "For example: {'disease':(['start_state'],['ctrl'],['alt_end'])}")
                 raise
             if self.anchor_gene is not None:
                 self.anchor_gene = None
                 logger.warning(
@@ -565,6 +627,7 @@ class InSilicoPerturberStats:
         "Gene_name": gene name
         "Ensembl_ID": gene Ensembl ID
         "N_Detections": number of cells in which each gene or gene combination was detected in the input dataset
         "Shift_to_goal_end": cosine shift from start state towards goal end state in response to given perturbation
         "Shift_to_alt_end": cosine shift from start state towards alternate end state in response to given perturbation

   ispstats = InSilicoPerturberStats(mode="goal_state_shift",
                                     combos=0,
                                     anchor_gene=None,
+                                    cell_states_to_model={"state_key": "disease",
+                                                          "start_state": "dcm",
+                                                          "goal_state": "nf",
+                                                          "alt_states": ["hcm", "other1", "other2"]})
   ispstats.get_stats("path/to/input_data",
                      None,
                      "path/to/output_directory",
 from sklearn.mixture import GaussianMixture
 from tqdm.notebook import trange, tqdm
+from .in_silico_perturber import flatten_list
 from .tokenizer import TOKEN_DICTIONARY_FILE
 GENE_NAME_ID_DICTIONARY_FILE = Path(__file__).parent / "gene_name_id_dict.pkl"
 # stats comparing cos sim shifts towards goal state of test perturbations vs random perturbations
 def isp_stats_to_goal_state(cos_sims_df, dict_list, cell_states_to_model, genes_perturbed):
+    cell_state_key = cell_states_to_model["start_state"]
+    if "alt_states" not in cell_states_to_model.keys():
         alt_end_state_exists = False
+    elif (len(cell_states_to_model["alt_states"]) > 0) and (cell_states_to_model["alt_states"] != [None]):
         alt_end_state_exists = True
     # for single perturbation in multiple cells, there are no random perturbations to compare to
         # quantify number of detections of each gene
         cos_sims_full_df["N_Detections"] = [n_detections(i, dict_list, "cell", None) for i in cos_sims_full_df["Gene"]]
+        # sort by shift to desired state\
+        cos_sims_full_df["Sig"] = [1 if fdr<0.05 else 0 for fdr in cos_sims_full_df["Goal_end_FDR"]]
+        cos_sims_full_df = cos_sims_full_df.sort_values(by=["Sig",
+                                                            "Shift_to_goal_end",
                                                             "Goal_end_FDR"],
+                                                            ascending=[False,False,True])
         return cos_sims_full_df
     cos_sims_full_df["Test_vs_null_FDR"] = get_fdr(cos_sims_full_df["Test_vs_null_pval"])
+    cos_sims_full_df["Sig"] = [1 if fdr<0.05 else 0 for fdr in cos_sims_full_df["Test_vs_null_FDR"]]
+    cos_sims_full_df = cos_sims_full_df.sort_values(by=["Sig",
+                                                        "Test_vs_null_avg_shift",
                                                         "Test_vs_null_FDR"],
+                                                        ascending=[False,False,True])
     return cos_sims_full_df
 # stats for identifying perturbations with largest effect within a given set of cells
                 analyzes data for the effect of anchor gene's perturbation on the embedding of each other gene.
         cell_states_to_model: None, dict
             Cell states to model if testing perturbations that achieve goal state change.
+            Four-item dictionary with keys: state_key, start_state, goal_state, and alt_states
+            state_key: key specifying name of column in .dataset that defines the start/goal states
+            start_state: value in the state_key column that specifies the start state
+            goal_state: value in the state_key column taht specifies the goal end state
+            alt_states: list of values in the state_key column that specify the alternate end states
+            For example: {"state_key": "disease",
+                          "start_state": "dcm",
+                          "goal_state": "nf",
+                          "alt_states": ["hcm", "other1", "other2"]}
         token_dictionary_file : Path
             Path to pickle file containing token dictionary (Ensembl ID:token).
         gene_name_id_dictionary_file : Path
         if self.cell_states_to_model is not None:
             if len(self.cell_states_to_model.items()) == 1:
+                logger.warning(
+                    "The single value dictionary for cell_states_to_model will be " \
+                    "replaced with a dictionary with named keys for start, goal, and alternate states. " \
+                    "Please specify state_key, start_state, goal_state, and alt_states " \
+                    "in the cell_states_to_model dictionary for future use. " \
+                    "For example, cell_states_to_model={" \
+                            "'state_key': 'disease', " \
+                            "'start_state': 'dcm', " \
+                            "'goal_state': 'nf', " \
+                            "'alt_states': ['hcm', 'other1', 'other2']}"
+                )
                 for key,value in self.cell_states_to_model.items():
                     if (len(value) == 3) and isinstance(value, tuple):
                         if isinstance(value[0],list) and isinstance(value[1],list) and isinstance(value[2],list):
                                 all_values = value[0]+value[1]+value[2]
                                 if len(all_values) == len(set(all_values)):
                                     continue
+                # reformat to the new named key format
+                state_values = flatten_list(list(self.cell_states_to_model.values()))
+                self.cell_states_to_model = {
+                    "state_key": list(self.cell_states_to_model.keys())[0],
+                    "start_state": state_values[0][0],
+                    "goal_state": state_values[1][0],
+                    "alt_states": state_values[2:][0]
+                }
+            elif set(self.cell_states_to_model.keys()) == {"state_key", "start_state", "goal_state", "alt_states"}:
+                if (self.cell_states_to_model["state_key"] is None) \
+                    or (self.cell_states_to_model["start_state"] is None) \
+                    or (self.cell_states_to_model["goal_state"] is None):
+                    logger.error(
+                        "Please specify 'state_key', 'start_state', and 'goal_state' in cell_states_to_model.")
+                    raise
+                if self.cell_states_to_model["start_state"] == self.cell_states_to_model["goal_state"]:
+                    logger.error(
+                        "All states must be unique.")
+                    raise
+                if self.cell_states_to_model["alt_states"] is not None:
+                    if type(self.cell_states_to_model["alt_states"]) is not list:
+                        logger.error(
+                            "self.cell_states_to_model['alt_states'] must be a list (even if it is one element)."
+                        )
+                        raise
+                    if len(self.cell_states_to_model["alt_states"])!= len(set(self.cell_states_to_model["alt_states"])):
+                        logger.error(
+                            "All states must be unique.")
+                        raise
             else:
                 logger.error(
+                    "cell_states_to_model must only have the following four keys: " \
+                    "'state_key', 'start_state', 'goal_state', 'alt_states'." \
+                    "For example, cell_states_to_model={" \
+                            "'state_key': 'disease', " \
+                            "'start_state': 'dcm', " \
+                            "'goal_state': 'nf', " \
+                            "'alt_states': ['hcm', 'other1', 'other2']}"
+                )
                 raise
             if self.anchor_gene is not None:
                 self.anchor_gene = None
                 logger.warning(
         "Gene_name": gene name
         "Ensembl_ID": gene Ensembl ID
         "N_Detections": number of cells in which each gene or gene combination was detected in the input dataset
+        "Sig": 1 if FDR<0.05, otherwise 0
         "Shift_to_goal_end": cosine shift from start state towards goal end state in response to given perturbation
         "Shift_to_alt_end": cosine shift from start state towards alternate end state in response to given perturbation