Add checks for custom attributes and n_counts before summing Ensembl IDs
Browse files - geneformer/tokenizer.py (+17 -0)
geneformer/tokenizer.py
CHANGED
@@ -88,6 +88,7 @@ def sum_ensembl_ids(
|
|
88 |
collapse_gene_ids,
|
89 |
gene_mapping_dict,
|
90 |
gene_token_dict,
|
|
|
91 |
file_format="loom",
|
92 |
chunk_size=512,
|
93 |
):
|
@@ -104,6 +105,13 @@ def sum_ensembl_ids(
|
|
104 |
"ensembl_id_collapsed" not in data.ra.keys()
|
105 |
), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
# Get the ensembl ids that exist in data
|
109 |
ensembl_ids = data.ra.ensembl_id
|
@@ -208,6 +216,13 @@ def sum_ensembl_ids(
|
|
208 |
assert (
|
209 |
"ensembl_id_collapsed" not in data.var.columns
|
210 |
), "'ensembl_id_collapsed' column already exists in data.var"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
|
213 |
# Get the ensembl ids that exist in data
|
@@ -461,6 +476,7 @@ class TranscriptomeTokenizer:
|
|
461 |
self.collapse_gene_ids,
|
462 |
self.gene_mapping_dict,
|
463 |
self.gene_token_dict,
|
|
|
464 |
file_format="h5ad",
|
465 |
chunk_size=self.chunk_size,
|
466 |
)
|
@@ -537,6 +553,7 @@ class TranscriptomeTokenizer:
|
|
537 |
self.collapse_gene_ids,
|
538 |
self.gene_mapping_dict,
|
539 |
self.gene_token_dict,
|
|
|
540 |
file_format="loom",
|
541 |
chunk_size=self.chunk_size,
|
542 |
)
|
|
|
88 |
collapse_gene_ids,
|
89 |
gene_mapping_dict,
|
90 |
gene_token_dict,
|
91 |
+
custom_attr_name_dict,
|
92 |
file_format="loom",
|
93 |
chunk_size=512,
|
94 |
):
|
|
|
105 |
"ensembl_id_collapsed" not in data.ra.keys()
|
106 |
), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
|
107 |
|
108 |
+
assert (
|
109 |
+
"n_counts" in data.ca.keys()
|
110 |
+
), "'n_counts' column missing from data.ca.keys()"
|
111 |
+
|
112 |
+
if custom_attr_name_dict is not None:
|
113 |
+
for label in custom_attr_name_dict:
|
114 |
+
assert label in data.ca.keys(), f"Attribute `{label}` not present in dataset features"
|
115 |
|
116 |
# Get the ensembl ids that exist in data
|
117 |
ensembl_ids = data.ra.ensembl_id
|
|
|
216 |
assert (
|
217 |
"ensembl_id_collapsed" not in data.var.columns
|
218 |
), "'ensembl_id_collapsed' column already exists in data.var"
|
219 |
+
assert (
|
220 |
+
"n_counts" in data.obs.columns
|
221 |
+
), "'n_counts' column missing from data.obs"
|
222 |
+
|
223 |
+
if custom_attr_name_dict is not None:
|
224 |
+
for label in custom_attr_name_dict:
|
225 |
+
assert label in data.obs.columns, f"Attribute `{label}` not present in data.obs"
|
226 |
|
227 |
|
228 |
# Get the ensembl ids that exist in data
|
|
|
476 |
self.collapse_gene_ids,
|
477 |
self.gene_mapping_dict,
|
478 |
self.gene_token_dict,
|
479 |
+
self.custom_attr_name_dict,
|
480 |
file_format="h5ad",
|
481 |
chunk_size=self.chunk_size,
|
482 |
)
|
|
|
553 |
self.collapse_gene_ids,
|
554 |
self.gene_mapping_dict,
|
555 |
self.gene_token_dict,
|
556 |
+
self.custom_attr_name_dict,
|
557 |
file_format="loom",
|
558 |
chunk_size=self.chunk_size,
|
559 |
)
|