theislab
/

Nicheformer

@@ -287,11 +287,6 @@ class NicheformerTokenizer(PreTrainedTokenizer):
                     for col in ['modality', 'specie', 'assay']:
                         if col in adata.obs.columns:
                             original_types[col] = adata.obs[col].dtype
-                    print(f"modality dtype: {adata.obs['modality'].dtype}")
-                    print(f"specie dtype: {adata.obs['specie'].dtype}")
-                    print(f"assay dtype: {adata.obs['assay'].dtype}")
                     # Concatenate and then remove the reference
                     adata = ad.concat([reference_model, adata], join='outer', axis=0)
@@ -304,10 +299,6 @@ class NicheformerTokenizer(PreTrainedTokenizer):
                                 adata.obs[col] = adata.obs[col].astype(dtype)
                             except Exception as e:
                                 print(f"Warning: Could not convert {col} back to {dtype}: {e}")
-                    print(f"modality dtype: {adata.obs['modality'].dtype}")
-                    print(f"specie dtype: {adata.obs['specie'].dtype}")
-                    print(f"assay dtype: {adata.obs['assay'].dtype}")
             # Get gene expression data
             X = adata.X
@@ -362,20 +353,20 @@ class NicheformerTokenizer(PreTrainedTokenizer):
         # Tokenize gene expression data
         token_ids = self._tokenize_gene_expression(X)
-        # Add special tokens if available
         special_tokens = np.zeros((token_ids.shape[0], 3), dtype=np.int64)
         special_token_mask = np.zeros((token_ids.shape[0], 3), dtype=bool)
-        if modality_tokens is not None:
-            special_tokens[:, 0] = modality_tokens
-            special_token_mask[:, 0] = True
         if species_tokens is not None:
-            special_tokens[:, 1] = species_tokens
-            special_token_mask[:, 1] = True
         if technology_tokens is not None:
-            special_tokens[:, 2] = technology_tokens
             special_token_mask[:, 2] = True
         # Only keep the special tokens that are present (have True in mask)

                     for col in ['modality', 'specie', 'assay']:
                         if col in adata.obs.columns:
                             original_types[col] = adata.obs[col].dtype
                     # Concatenate and then remove the reference
                     adata = ad.concat([reference_model, adata], join='outer', axis=0)
                                 adata.obs[col] = adata.obs[col].astype(dtype)
                             except Exception as e:
                                 print(f"Warning: Could not convert {col} back to {dtype}: {e}")
             # Get gene expression data
             X = adata.X
         # Tokenize gene expression data
         token_ids = self._tokenize_gene_expression(X)
+        # Add special tokens if available; column order is [species, technology, modality]
         special_tokens = np.zeros((token_ids.shape[0], 3), dtype=np.int64)
         special_token_mask = np.zeros((token_ids.shape[0], 3), dtype=bool)
         if species_tokens is not None:
+            special_tokens[:, 0] = species_tokens
+            special_token_mask[:, 0] = True
         if technology_tokens is not None:
+            special_tokens[:, 1] = technology_tokens
+            special_token_mask[:, 1] = True
+        if modality_tokens is not None:
+            special_tokens[:, 2] = modality_tokens
             special_token_mask[:, 2] = True
         # Only keep the special tokens that are present (have True in mask)