update tokenizer to use total counts
Browse files
- geneformer/tokenizer.py +6 -1
geneformer/tokenizer.py
CHANGED
@@ -183,7 +183,12 @@ class TranscriptomeTokenizer:
|
|
183 |
filter_pass_loc, coding_miRNA_loc # filter cells and genes
|
184 |
]
|
185 |
|
186 |
-
X_norm = (
|
|
|
|
|
|
|
|
|
|
|
187 |
|
188 |
tokenized_cells += [
|
189 |
tokenize_cell(X_norm[i, ...].A.flatten(), coding_miRNA_tokens)
|
|
|
183 |
filter_pass_loc, coding_miRNA_loc # filter cells and genes
|
184 |
]
|
185 |
|
186 |
+
X_norm = (
|
187 |
+
adata_filter.X
|
188 |
+
/ adata.obs["n_counts"].values.reshape(-1, 1)
|
189 |
+
* 10_000
|
190 |
+
/ norm_factor_vector
|
191 |
+
).tocsr()
|
192 |
|
193 |
tokenized_cells += [
|
194 |
tokenize_cell(X_norm[i, ...].A.flatten(), coding_miRNA_tokens)
|