issue with single gene perturbation
#358
by
cstrlln
- opened
I'm getting this error when trying to run an in silico perturbation with a single gene:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Here is my code:
# first obtain start, goal, and alt embedding positions
# this function was changed to be separate from perturb_data
# to avoid repeating calculations when parallelizing perturb_data
cell_states_to_model={"state_key": "cell_type",
"start_state": "Mid",
"goal_state": "Late_3",
"alt_states": ["Late_1","Late_2","Late_4"]}
filter_data_dict={"cell_type":["Mid","Late_3","Late_1","Late_2","Late_4"]}
# embex = EmbExtractor(model_type="CellClassifier",
# num_classes=3,
# filter_data=filter_data_dict,
# max_ncells=1000,
# emb_layer=0,
# summary_stat="exact_mean",
# forward_batch_size=32,
# nproc=16)
embex = EmbExtractor(model_type="CellClassifier",
num_classes=10,
max_ncells=1000,
emb_layer=0,
summary_stat="exact_mean",
forward_batch_size=10,
nproc=8)
state_embs_dict = embex.get_state_embs(cell_states_to_model,
"classifier/240625155408/240625_geneformer_cellClassifier_asc_classifier_test/ksplit1",
"data_for_geneformer/asc_organs.dataset",
output_directory = output_dir,
output_prefix = output_prefix)
isp = InSilicoPerturber(perturb_type="overexpress",
perturb_rank_shift=None,
genes_to_perturb= ['ENSG00000171791'],
combos=0,
anchor_gene=None,
model_type="CellClassifier",
num_classes=10,
emb_mode="cell",
cell_emb_style="mean_pool",
cell_states_to_model=cell_states_to_model,
state_embs_dict=state_embs_dict,
max_ncells=2000,
emb_layer=0,
forward_batch_size=4,
nproc=1)
isp.perturb_data("classifier/240625155408/240625_geneformer_cellClassifier_asc_classifier_test/ksplit1",
"data_for_geneformer/asc_organs.dataset",
output_directory = output_dir,
output_prefix = "pert")
ispstats = InSilicoPerturberStats(mode="goal_state_shift",
genes_perturbed=genes,
combos=0,
anchor_gene=None,
cell_states_to_model=cell_states_to_model)
ispstats.get_stats(input_data_directory = output_dir,
null_dist_data_directory = None,
output_directory = output_dir,
output_prefix = "stats_bcl2_over2")
Here is the full error; it fails only in the last step:
100%
 4/4 [00:00<00:00, 437.56it/s]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[30], line 2
1 # extracts data from intermediate files and processes stats to output in final .csv
----> 2 ispstats.get_stats(input_data_directory = output_dir,
3 null_dist_data_directory = None,
4 output_directory = output_dir,
5 output_prefix = "stats_bcl2_over2")
File ~/miniforge3/envs/geneformer/lib/python3.10/site-packages/geneformer/in_silico_perturber_stats.py:975, in InSilicoPerturberStats.get_stats(self, input_data_directory, null_dist_data_directory, output_directory, output_prefix, null_dict_list)
966 else:
967 # cos sim data for effect of gene perturbation on the embedding of each cell
968 dict_list = read_dictionaries(
969 input_data_directory,
970 "cell",
(...)
973 self.pickle_suffix,
974 )
--> 975 gene_list = get_gene_list(dict_list, "cell")
977 # initiate results dataframe
978 cos_sims_df_initial = pd.DataFrame(
979 {
980 "Gene": gene_list,
(...)
991 index=[i for i in range(len(gene_list))],
992 )
File ~/miniforge3/envs/geneformer/lib/python3.10/site-packages/geneformer/in_silico_perturber_stats.py:153, in get_gene_list(dict_list, mode)
151 if mode == "gene":
152 gene_list.remove("cell_emb")
--> 153 gene_list.sort()
154 return gene_list
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()