Fix method name in tokenize_anndata method
tokenization_nicheformer.py (CHANGED: +27 -43)
@@ -33,27 +33,27 @@ class NicheformerTokenizer(PreTrainedTokenizer):
             max_seq_len: Maximum sequence length
             aux_tokens: Number of auxiliary tokens reserved
         """
-        # Initialize the parent class first
-        super().__init__(
-            pad_token="<pad>",
-            eos_token="<eos>",
-            unk_token="<unk>",
-            **kwargs
-        )
-
-        self.max_seq_len = max_seq_len
-        self.aux_tokens = aux_tokens
-
         # Initialize vocabulary
         self.vocab = {}
         self.ids_to_tokens = {}
 
         # Load vocabulary if provided
-        if vocab_file is not None:
+        if vocab_file is not None and os.path.isfile(vocab_file):
             with open(vocab_file, 'r', encoding='utf-8') as f:
                 self.vocab = json.load(f)
             self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
 
+        # Initialize the parent class
+        super().__init__(
+            pad_token="<pad>",
+            eos_token="<eos>",
+            unk_token="<unk>",
+            **kwargs
+        )
+
+        self.max_seq_len = max_seq_len
+        self.aux_tokens = aux_tokens
+
         # Define token constants to match Nicheformer
         self._pad_token_id = 0
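The reordering above matters on recent versions of the transformers library: PreTrainedTokenizer.__init__ registers special tokens through _add_tokens(), which reads self.get_vocab(), so the vocabulary attributes must already exist when super().__init__() runs. Note also that the new os.path.isfile() guard presupposes an `import os` at the top of the module (not shown in this hunk). A minimal sketch of the same initialization order, using a toy vocabulary rather than the real Nicheformer gene vocabulary:

from transformers import PreTrainedTokenizer

class ToyTokenizer(PreTrainedTokenizer):
    """Sketch of the vocab-before-super() pattern; toy names, not Nicheformer's."""

    def __init__(self, **kwargs):
        # Build the vocab BEFORE the parent constructor runs, because
        # PreTrainedTokenizer.__init__ -> _add_tokens() -> self.get_vocab().
        self.vocab = {"<pad>": 0, "<eos>": 1, "<mask>": 2}
        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
        super().__init__(pad_token="<pad>", eos_token="<eos>",
                         unk_token="<unk>", **kwargs)

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab)

    def _tokenize(self, text):
        return text.split()

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, 0)

    def _convert_id_to_token(self, index):
        return self.ids_to_tokens.get(index, "<unk>")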
@@ -86,43 +86,27 @@ class NicheformerTokenizer(PreTrainedTokenizer):
             "CITE-seq": 17,
             "Smart-seq v4": 18,
         }
-
-        # Create a default vocabulary if none was loaded
-        if not self.vocab:
-            self.vocab = {
-                # Special tokens
-                "<pad>": 0,
-                "<eos>": 1,
-                "<mask>": 2,
-            }
-
+
+    def get_vocab(self) -> Dict[str, int]:
+        """Return the vocabulary as a dictionary of token to index."""
+        if not self.vocab:
+            # If vocab is empty, create a minimal vocab with special tokens
+            vocab = {}
+            # Add special tokens
+            vocab["<pad>"] = 0
+            vocab["<eos>"] = 1
+            vocab["<mask>"] = 2
             # Add modality tokens
             for token, idx in self.modality_dict.items():
-                self.vocab[token] = idx
-
+                vocab[token] = idx
             # Add species tokens
             for token, idx in self.specie_dict.items():
-                self.vocab[token] = idx
-
+                vocab[token] = idx
             # Add technology tokens
             for token, idx in self.technology_dict.items():
-                self.vocab[token] = idx
-
-            # In a real implementation, you would add actual gene names here
-
-        # Create reverse vocabulary (id to token)
-        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def get_vocab(self):
-        return dict(self.vocab)
+            return vocab
+        return self.vocab
 
     def _tokenize(self, text):
         """
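The new get_vocab() means the tokenizer exposes a usable vocabulary even when no vocab_file was loaded: it falls back to the special tokens plus the modality, species, and technology dictionaries. The Dict[str, int] annotation assumes `from typing import Dict` at the top of the module. A hypothetical usage sketch (the constructor signature is assumed, not shown in these hunks):

tok = NicheformerTokenizer(vocab_file=None)  # nothing loaded from disk
vocab = tok.get_vocab()                      # falls back to built-in tokens
assert vocab["<pad>"] == 0 and vocab["<eos>"] == 1 and vocab["<mask>"] == 2
assert vocab["CITE-seq"] == 17               # technology tokens are included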