Tokenization fixed
tokenization_nicheformer.py  (+29 -0)
@@ -110,6 +110,7 @@ class NicheformerTokenizer(PreTrainedTokenizer):
         aux_tokens: int = 30,
         median_counts_per_gene: Optional[np.ndarray] = None,
         gene_names: Optional[List[str]] = None,
+        technology_mean: Optional[Union[str, np.ndarray]] = None,
         **kwargs
     ):
         # Initialize base vocabulary
@@ -160,6 +161,25 @@ class NicheformerTokenizer(PreTrainedTokenizer):
         self._mask_token = "[MASK]"
         self._cls_token = "[CLS]"
 
+        # Load technology mean if provided
+        self.technology_mean = None
+        if technology_mean is not None:
+            self._load_technology_mean(technology_mean)
+
+    def _load_technology_mean(self, technology_mean):
+        """Load technology mean from file or array."""
+        if isinstance(technology_mean, str):
+            try:
+                self.technology_mean = np.load(technology_mean)
+                print(f"Loaded technology mean from {technology_mean} with shape {self.technology_mean.shape}")
+            except Exception as e:
+                print(f"Warning: Could not load technology mean from {technology_mean}: {e}")
+        elif isinstance(technology_mean, np.ndarray):
+            self.technology_mean = technology_mean
+            print(f"Using provided technology mean array with shape {self.technology_mean.shape}")
+        else:
+            print(f"Warning: Invalid technology_mean type: {type(technology_mean)}")
+
     def get_vocab(self) -> Dict[str, int]:
         """Returns the vocabulary mapping."""
         return self._vocabulary.copy()
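The new argument is only wired through the constructor. A minimal usage sketch, assuming the import path matches the file name and that the other constructor arguments (aux_tokens, median_counts_per_gene, gene_names) can stay at their defaults; the gene count and the .npy path below are placeholders:

import numpy as np
from tokenization_nicheformer import NicheformerTokenizer  # assumed import path

n_genes = 3000  # placeholder gene count

# Option 1: pass a per-gene mean expression array directly
tech_mean = np.full(n_genes, 0.5, dtype=np.float32)  # hypothetical values
tokenizer = NicheformerTokenizer(technology_mean=tech_mean)

# Option 2: point to a saved .npy file; _load_technology_mean() loads it with np.load()
tokenizer_from_file = NicheformerTokenizer(technology_mean="technology_mean.npy")  # placeholder path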
@@ -232,6 +252,15 @@ class NicheformerTokenizer(PreTrainedTokenizer):
         median_counts += median_counts == 0
         x = x / median_counts.reshape((1, -1))
 
+        # Apply technology mean normalization if available
+        if self.technology_mean is not None and self.technology_mean.shape[0] == x.shape[1]:
+            # Avoid division by zero
+            safe_mean = np.maximum(self.technology_mean, 1e-6)
+            x = x / safe_mean
+
+        # Apply log1p transformation
+        x = np.log1p(x)
+
         # Convert to tokens
         tokens = _sub_tokenize_data(x, self.max_length, self.aux_tokens)
 
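With this change, the count matrix x passes through three steps before _sub_tokenize_data: division by per-gene median counts, optional division by the technology mean (floored at 1e-6 to avoid division by zero), and a log1p transform. A self-contained sketch of just those normalization steps on toy data, independent of the tokenizer class:

import numpy as np

def preprocess_counts(x, median_counts, technology_mean=None):
    """Standalone reproduction of the normalization steps shown above (not the full tokenizer)."""
    median_counts = median_counts.astype(float)
    median_counts += median_counts == 0           # guard zero medians, as in the hunk above
    x = x / median_counts.reshape((1, -1))        # scale each gene by its median count
    if technology_mean is not None and technology_mean.shape[0] == x.shape[1]:
        safe_mean = np.maximum(technology_mean, 1e-6)  # avoid division by zero
        x = x / safe_mean
    return np.log1p(x)                            # log1p transform before token conversion

# Toy example: 2 cells x 4 genes, made-up values
x = np.array([[0., 2., 4., 8.], [1., 0., 2., 4.]])
out = preprocess_counts(x,
                        median_counts=np.array([1., 2., 0., 4.]),
                        technology_mean=np.array([0.5, 1.0, 0.0, 2.0]))
print(out.shape)  # (2, 4)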