theislab
/

Nicheformer

transcriptomics

Model card Files Files and versions

aletlvl committed on Mar 24

Commit

211103a

·

verified ·

1 Parent(s): fadb7a9

Tokenization with model alignment

Files changed (1) hide show

tokenization_nicheformer.py +22 -0

tokenization_nicheformer.py CHANGED Viewed

@@ -8,6 +8,7 @@ from scipy.sparse import issparse
 import numba
 import os
 import json
 # Token IDs must match exactly with the original implementation
 PAD_TOKEN = 0
@@ -88,6 +89,19 @@ class NicheformerTokenizer(PreTrainedTokenizer):
     species_dict = SPECIES_DICT
     technology_dict = TECHNOLOGY_DICT
     def __init__(
         self,
         vocab_file=None,
@@ -138,6 +152,7 @@ class NicheformerTokenizer(PreTrainedTokenizer):
         self.aux_tokens = aux_tokens
         self.median_counts_per_gene = median_counts_per_gene
         self.gene_names = gene_names
         # Set up special token mappings
         self._pad_token = "[PAD]"
@@ -243,6 +258,13 @@ class NicheformerTokenizer(PreTrainedTokenizer):
             Dictionary with model inputs
         """
         if adata is not None:
             # Get expression matrix
             if issparse(adata.X):
                 x = adata.X.toarray()

 import numba
 import os
 import json
+from huggingface_hub import hf_hub_download
 # Token IDs must match exactly with the original implementation
 PAD_TOKEN = 0
     species_dict = SPECIES_DICT
     technology_dict = TECHNOLOGY_DICT
+    def _load_reference_model(self):
+        """Load reference model for gene alignment.
+
+        Downloads ``model.h5ad`` from the Hugging Face Hub repository named by
+        ``self.name_or_path`` (falling back to ``"aletlvl/Nicheformer"`` when
+        the attribute is absent) and reads it with ``ad.read_h5ad``.
+
+        Returns:
+            The object returned by ``ad.read_h5ad``, or ``None`` if the
+            download or the read raised any exception.
+        """
+        try:
+            # Get the model name or path from the tokenizer
+            repo_id = self.name_or_path if hasattr(self, "name_or_path") else "aletlvl/Nicheformer"
+            # Download the reference model if not already cached
+            model_path = hf_hub_download(repo_id=repo_id, filename="model.h5ad")
+            # NOTE(review): `ad` is presumably the anndata module imported elsewhere in the file — confirm.
+            return ad.read_h5ad(model_path)
+        except Exception as e:
+            # Best-effort: any failure is printed as a warning and signalled by None, never raised.
+            print(f"Warning: Could not load reference model: {e}")
+            return None
     def __init__(
         self,
         vocab_file=None,
         self.aux_tokens = aux_tokens
         self.median_counts_per_gene = median_counts_per_gene
         self.gene_names = gene_names
+        self.name_or_path = kwargs.get('name_or_path', 'aletlvl/Nicheformer')
         # Set up special token mappings
         self._pad_token = "[PAD]"
             Dictionary with model inputs
         """
         if adata is not None:
+            # Align with reference model if needed
+            reference_model = self._load_reference_model()
+            if reference_model is not None:
+                # Concatenate and then remove the reference
+                adata = ad.concat([reference_model, adata], join='outer', axis=0)
+                adata = adata[1:]
             # Get expression matrix
             if issparse(adata.X):
                 x = adata.X.toarray()