hchen725 committed on
Commit
da2aaea
1 Parent(s): e2ee685

Add checks for custom attributes and n_counts prior to summing Ensembl IDs (sum_ensembl_ids)

Browse files
Files changed (1) hide show
  1. geneformer/tokenizer.py +17 -0
geneformer/tokenizer.py CHANGED
@@ -88,6 +88,7 @@ def sum_ensembl_ids(
88
  collapse_gene_ids,
89
  gene_mapping_dict,
90
  gene_token_dict,
 
91
  file_format="loom",
92
  chunk_size=512,
93
  ):
@@ -104,6 +105,13 @@ def sum_ensembl_ids(
104
  "ensembl_id_collapsed" not in data.ra.keys()
105
  ), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
106
 
 
 
 
 
 
 
 
107
 
108
  # Get the ensembl ids that exist in data
109
  ensembl_ids = data.ra.ensembl_id
@@ -208,6 +216,13 @@ def sum_ensembl_ids(
208
  assert (
209
  "ensembl_id_collapsed" not in data.var.columns
210
  ), "'ensembl_id_collapsed' column already exists in data.var"
 
 
 
 
 
 
 
211
 
212
 
213
  # Get the ensembl ids that exist in data
@@ -461,6 +476,7 @@ class TranscriptomeTokenizer:
461
  self.collapse_gene_ids,
462
  self.gene_mapping_dict,
463
  self.gene_token_dict,
 
464
  file_format="h5ad",
465
  chunk_size=self.chunk_size,
466
  )
@@ -537,6 +553,7 @@ class TranscriptomeTokenizer:
537
  self.collapse_gene_ids,
538
  self.gene_mapping_dict,
539
  self.gene_token_dict,
 
540
  file_format="loom",
541
  chunk_size=self.chunk_size,
542
  )
 
88
  collapse_gene_ids,
89
  gene_mapping_dict,
90
  gene_token_dict,
91
+ custom_attr_name_dict,
92
  file_format="loom",
93
  chunk_size=512,
94
  ):
 
105
  "ensembl_id_collapsed" not in data.ra.keys()
106
  ), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
107
 
108
+ assert (
109
+ "n_counts" in data.ca.keys()
110
+ ), "'n_counts' column missing from data.ca.keys()"
111
+
112
+ if custom_attr_name_dict is not None:
113
+ for label in custom_attr_name_dict:
114
+ assert label in data.ca.keys(), f"Attribute `{label}` not present in dataset features"
115
 
116
  # Get the ensembl ids that exist in data
117
  ensembl_ids = data.ra.ensembl_id
 
216
  assert (
217
  "ensembl_id_collapsed" not in data.var.columns
218
  ), "'ensembl_id_collapsed' column already exists in data.var"
219
+ assert (
220
+ "n_counts" in data.obs.columns
221
+ ), "'n_counts' column missing from data.obs"
222
+
223
+ if custom_attr_name_dict is not None:
224
+ for label in custom_attr_name_dict:
225
+ assert label in data.obs.columns, f"Attribute `{label}` not present in data.obs"
226
 
227
 
228
  # Get the ensembl ids that exist in data
 
476
  self.collapse_gene_ids,
477
  self.gene_mapping_dict,
478
  self.gene_token_dict,
479
+ self.custom_attr_name_dict,
480
  file_format="h5ad",
481
  chunk_size=self.chunk_size,
482
  )
 
553
  self.collapse_gene_ids,
554
  self.gene_mapping_dict,
555
  self.gene_token_dict,
556
+ self.custom_attr_name_dict,
557
  file_format="loom",
558
  chunk_size=self.chunk_size,
559
  )