Upload Nicheformer model

Browse files

Files changed (10) hide show

LICENSE +21 -0
README.md +74 -0
__init__.py +6 -0
config.json +23 -0
configuration_nicheformer.py +60 -0
masking.py +63 -0
model.safetensors +3 -0
modeling_nicheformer.py +162 -0
tokenization_nicheformer.py +330 -0
vocab.json +0 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 Nicheformer Contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,74 @@

+# Nicheformer
+Nicheformer is a transformer-based model designed for understanding and predicting cellular niches and their interactions. The model uses masked language modeling to learn representations of cellular contexts and their relationships.
+## Model Description
+Nicheformer is built on a transformer architecture with the following key features:
+- **Architecture**: Transformer encoder with customizable number of layers and attention heads
+- **Pre-training**: Masked Language Modeling (MLM) objective with dynamic masking
+- **Input Processing**: Handles cell type, assay, and modality information
+- **Positional Encoding**: Supports both learnable and fixed positional embeddings
+- **Masking Strategy**:
+  - 80% of selected tokens are replaced with [MASK]
+  - 10% are replaced with random tokens
+  - 10% remain unchanged
+### Model Architecture
+- Transformer encoder layers: 12
+- Hidden dimension: 512
+- Attention heads: 16
+- Feedforward dimension: 1024
+- Maximum sequence length: 1500
+- Vocabulary size: 20340 (`n_tokens` in `config.json`)
+- Masking probability: 15%
+## Usage
+```python
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+# Load model and tokenizer
+model = AutoModelForMaskedLM.from_pretrained("your-username/nicheformer")
+tokenizer = AutoTokenizer.from_pretrained("your-username/nicheformer")
+# Example 1: Manual masking
+masked_text = "The [MASK] cell is an important immune cell type."
+inputs = tokenizer(masked_text, return_tensors="pt")
+outputs = model(**inputs)
+# Example 2: Automatic masking (typically used during training)
+text = "The T cell is an important immune cell type."
+inputs = tokenizer(text, return_tensors="pt")
+outputs = model(**inputs, apply_masking=True)  # This will automatically mask tokens
+```
+## Training Data
+[Describe the training data used for the model]
+## Evaluation Results
+[Include evaluation metrics and results]
+## Limitations
+[Describe any known limitations or biases of the model]
+## Citation
+If you use this model in your research, please cite:
+```bibtex
+[Add citation information]
+```
+## License
+This model is released under the MIT License (see the `LICENSE` file).
+## Contact
+[Add contact information for questions and issues]

__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from .configuration_nicheformer import NicheformerConfig
+from .modeling_nicheformer import (
+    NicheformerPreTrainedModel,
+    NicheformerModel,
+    NicheformerForMaskedLM
+)

config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "architectures": [
+    "NicheformerForMaskedLM"
+  ],
+  "assay": true,
+  "batch_first": true,
+  "cls_classes": 164,
+  "context_length": 1500,
+  "dim_feedforward": 1024,
+  "dim_model": 512,
+  "dropout": 0.0,
+  "learnable_pe": true,
+  "masking_p": 0.15,
+  "modality": true,
+  "model_type": "nicheformer",
+  "n_tokens": 20340,
+  "nheads": 16,
+  "nlayers": 12,
+  "specie": true,
+  "supervised_task": null,
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0"
+}

configuration_nicheformer.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from transformers import PretrainedConfig
+class NicheformerConfig(PretrainedConfig):
+    model_type = "nicheformer"
+    def __init__(
+        self,
+        dim_model=512,
+        nheads=16,
+        dim_feedforward=1024,
+        nlayers=12,
+        dropout=0.0,
+        batch_first=True,
+        masking_p=0.15,
+        n_tokens=20340,
+        context_length=1500,
+        cls_classes=164,
+        supervised_task=None,
+        learnable_pe=True,
+        specie=True,
+        assay=True,
+        modality=True,
+        **kwargs
+    ):
+        """Initialize NicheformerConfig.
+        Args:
+            dim_model: Dimensionality of the model
+            nheads: Number of attention heads
+            dim_feedforward: Dimensionality of MLPs in attention blocks
+            nlayers: Number of transformer layers
+            dropout: Dropout probability
+            batch_first: Whether batch dimension is first
+            masking_p: Probability of masking tokens
+            n_tokens: Total number of tokens (excluding auxiliary)
+            context_length: Length of the context window
+            cls_classes: Number of classification classes
+            supervised_task: Type of supervised task
+            learnable_pe: Whether to use learnable positional embeddings
+            specie: Whether to add specie token
+            assay: Whether to add assay token
+            modality: Whether to add modality token
+        """
+        super().__init__(**kwargs)
+        self.dim_model = dim_model
+        self.nheads = nheads
+        self.dim_feedforward = dim_feedforward
+        self.nlayers = nlayers
+        self.dropout = dropout
+        self.batch_first = batch_first
+        self.masking_p = masking_p
+        self.n_tokens = n_tokens
+        self.context_length = context_length
+        self.cls_classes = cls_classes
+        self.supervised_task = supervised_task
+        self.learnable_pe = learnable_pe
+        self.specie = specie
+        self.assay = assay
+        self.modality = modality

masking.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import torch
+import random
+# Special token ids used by the masking routine in this module.
+# NOTE(review): tokenization_nicheformer.py in this same upload declares
+# PAD_TOKEN = 0 and MASK_TOKEN = 1, i.e. the opposite assignment of ids 0 and 1.
+# Confirm which mapping the checkpoint expects before mixing the two modules.
+MASK_TOKEN = 0
+PAD_TOKEN = 1
+CLS_TOKEN = 2
+def complete_masking(batch, masking_p, n_tokens):
+    """Apply masking to input batch for masked language modeling.
+    Args:
+        batch (dict): Input batch containing 'input_ids' and 'attention_mask'
+        masking_p (float): Probability of masking a token
+        n_tokens (int): Total number of tokens in vocabulary
+    Returns:
+        dict: Batch with masked indices and masking information
+    """
+    device = batch['input_ids'].device
+    input_ids = batch['input_ids']
+    attention_mask = batch['attention_mask']
+    # Create mask tensor (1 for tokens to be masked, 0 otherwise)
+    prob = torch.rand(input_ids.shape, device=device)
+    mask = (prob < masking_p) & (input_ids != PAD_TOKEN) & (input_ids != CLS_TOKEN)
+    # For masked tokens:
+    # - 80% replace with MASK token
+    # - 10% replace with random token
+    # - 10% keep unchanged
+    masked_indices = input_ids.clone()
+    # Calculate number of tokens to be masked
+    num_tokens_to_mask = mask.sum().item()
+    # Determine which tokens get which type of masking
+    mask_mask = torch.rand(num_tokens_to_mask, device=device) < 0.8
+    random_mask = (torch.rand(num_tokens_to_mask, device=device) < 0.5) & ~mask_mask
+    # Apply MASK token (80% of masked tokens)
+    masked_indices[mask] = torch.where(
+        mask_mask,
+        torch.tensor(MASK_TOKEN, device=device, dtype=torch.long),
+        masked_indices[mask]
+    )
+    # Apply random tokens (10% of masked tokens)
+    random_tokens = torch.randint(
+        3, n_tokens,  # Start from 3 to avoid special tokens
+        (random_mask.sum(),),
+        device=device,
+        dtype=torch.long
+    )
+    masked_indices[mask][random_mask] = random_tokens
+    # 10% remain unchanged
+    return {
+        'masked_indices': masked_indices,
+        'attention_mask': attention_mask,
+        'mask': mask,
+        'input_ids': input_ids
+    }

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:65fff02ac9bdd978cbb06b158a60d94ec59a96f30e706aed2c56d213d582e603
+size 195851648

modeling_nicheformer.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import MaskedLMOutput
+from configuration_nicheformer import NicheformerConfig
+from masking import complete_masking, MASK_TOKEN, PAD_TOKEN, CLS_TOKEN
+import math
+class PositionalEncoding(nn.Module):
+    """Positional encoding using sine and cosine functions."""
+    def __init__(self, d_model: int, max_seq_len: int):
+        super().__init__()
+        encoding = torch.zeros(max_seq_len, d_model)
+        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        encoding[:, 0::2] = torch.sin(position * div_term)
+        encoding[:, 1::2] = torch.cos(position * div_term)
+        encoding = encoding.unsqueeze(0)
+        self.register_buffer('encoding', encoding, persistent=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Add positional encoding to input tensor."""
+        return x + self.encoding[:, :x.size(1)]
+class NicheformerPreTrainedModel(PreTrainedModel):
+    """Base class for Nicheformer models."""
+    config_class = NicheformerConfig
+    base_model_prefix = "nicheformer"
+    supports_gradient_checkpointing = True
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            nn.init.xavier_normal_(module.weight)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+class NicheformerModel(NicheformerPreTrainedModel):
+    def __init__(self, config: NicheformerConfig):
+        super().__init__(config)
+        # Core transformer components
+        self.encoder_layer = nn.TransformerEncoderLayer(
+            d_model=config.dim_model,
+            nhead=config.nheads,
+            dim_feedforward=config.dim_feedforward,
+            batch_first=config.batch_first,
+            dropout=config.dropout,
+            layer_norm_eps=1e-12
+        )
+        self.encoder = nn.TransformerEncoder(
+            encoder_layer=self.encoder_layer,
+            num_layers=config.nlayers,
+            enable_nested_tensor=False
+        )
+        # Embedding layers
+        self.embeddings = nn.Embedding(
+            num_embeddings=config.n_tokens+5,
+            embedding_dim=config.dim_model,
+            padding_idx=1
+        )
+        if config.learnable_pe:
+            self.positional_embedding = nn.Embedding(
+                num_embeddings=config.context_length,
+                embedding_dim=config.dim_model
+            )
+            self.dropout = nn.Dropout(p=config.dropout)
+            self.register_buffer('pos', torch.arange(0, config.context_length, dtype=torch.long))
+        else:
+            self.positional_embedding = PositionalEncoding(
+                d_model=config.dim_model,
+                max_seq_len=config.context_length
+            )
+        # Initialize weights
+        self.post_init()
+    def forward(self, input_ids, attention_mask=None):
+        token_embedding = self.embeddings(input_ids)
+        if self.config.learnable_pe:
+            pos_embedding = self.positional_embedding(self.pos.to(token_embedding.device))
+            embeddings = self.dropout(token_embedding + pos_embedding)
+        else:
+            embeddings = self.positional_embedding(token_embedding)
+        # Convert attention_mask to boolean and invert it for transformer's src_key_padding_mask
+        # True indicates positions that will be masked
+        if attention_mask is not None:
+            attention_mask = ~attention_mask.bool()
+        transformer_output = self.encoder(
+            embeddings,
+            src_key_padding_mask=attention_mask if attention_mask is not None else None,
+            is_causal=False
+        )
+        return transformer_output
+class NicheformerForMaskedLM(NicheformerPreTrainedModel):
+    def __init__(self, config: NicheformerConfig):
+        super().__init__(config)
+        self.nicheformer = NicheformerModel(config)
+        self.classifier_head = nn.Linear(config.dim_model, config.n_tokens, bias=False)
+        self.classifier_head.bias = nn.Parameter(torch.zeros(config.n_tokens))
+        # Initialize weights
+        self.post_init()
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        labels=None,
+        return_dict=None,
+        apply_masking=False,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # Apply masking if requested (typically during training)
+        if apply_masking:
+            batch = {
+                'input_ids': input_ids,
+                'attention_mask': attention_mask
+            }
+            masked_batch = complete_masking(batch, self.config.masking_p, self.config.n_tokens)
+            input_ids = masked_batch['masked_indices']
+            labels = masked_batch['input_ids']  # Original tokens become labels
+            mask = masked_batch['mask']
+            # Only compute loss on masked tokens and ensure labels are long
+            labels = torch.where(mask, labels, torch.tensor(-100, device=labels.device)).long()
+        transformer_output = self.nicheformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+        )
+        prediction_scores = self.classifier_head(transformer_output)
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = nn.CrossEntropyLoss()
+            masked_lm_loss = loss_fct(
+                prediction_scores.view(-1, self.config.n_tokens),
+                labels.view(-1)
+            )
+        if not return_dict:
+            output = (prediction_scores,) + (transformer_output,)
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=transformer_output,
+        )

tokenization_nicheformer.py ADDED Viewed

	@@ -0,0 +1,330 @@

+from typing import List, Dict, Optional, Union, Tuple
+import numpy as np
+from transformers import PreTrainedTokenizer
+from dataclasses import dataclass
+import torch
+import anndata as ad
+from scipy.sparse import issparse
+import numba
+import os
+import json
+# Token IDs must match exactly with the original implementation
+# NOTE(review): masking.py in this same upload declares MASK_TOKEN = 0 and
+# PAD_TOKEN = 1, i.e. the opposite assignment of ids 0 and 1 — confirm which
+# mapping the checkpoint expects.
+PAD_TOKEN = 0
+MASK_TOKEN = 1
+CLS_TOKEN = 2
+# These mappings preserve the exact token IDs from the original implementation
+# Modality name -> token id
+MODALITY_DICT = {
+    'dissociated': 3,
+    'spatial': 4,
+}
+# Species name -> token id; common and scientific names share one id
+SPECIES_DICT = {
+    'human': 5,
+    'Homo sapiens': 5,
+    'Mus musculus': 6,
+    'mouse': 6,
+}
+# Technology / assay name -> token id; several spellings map to the same id
+TECHNOLOGY_DICT = {
+    "merfish": 7,
+    "MERFISH": 7,
+    "cosmx": 8,
+    "NanoString digital spatial profiling": 8,
+    "Xenium": 9,
+    "10x 5' v2": 10,
+    "10x 3' v3": 11,
+    "10x 3' v2": 12,
+    "10x 5' v1": 13,
+    "10x 3' v1": 14,
+    "10x 3' transcription profiling": 15,
+    "10x transcription profiling": 15,
+    "10x 5' transcription profiling": 16,
+    "CITE-seq": 17,
+    "Smart-seq v4": 18,
+}
+def sf_normalize(X: np.ndarray) -> np.ndarray:
+    """Size factor normalize to 10k counts."""
+    X = X.copy()
+    counts = np.array(X.sum(axis=1))
+    # avoid zero division error
+    counts += counts == 0.
+    # normalize to 10000 counts
+    scaling_factor = 10000. / counts
+    if issparse(X):
+        from scipy.sparse import sparsefuncs
+        sparsefuncs.inplace_row_scale(X, scaling_factor)
+    else:
+        np.multiply(X, scaling_factor.reshape((-1, 1)), out=X)
+    return X
+@numba.jit(nopython=True, nogil=True)
+def _sub_tokenize_data(x: np.ndarray, max_seq_len: int = -1, aux_tokens: int = 30) -> np.ndarray:
+    """Tokenize the input gene vector."""
+    scores_final = np.empty((x.shape[0], max_seq_len if max_seq_len > 0 else x.shape[1]))
+    for i, cell in enumerate(x):
+        nonzero_mask = np.nonzero(cell)[0]
+        sorted_indices = nonzero_mask[np.argsort(-cell[nonzero_mask])][:max_seq_len]
+        sorted_indices = sorted_indices + aux_tokens
+        if max_seq_len:
+            scores = np.zeros(max_seq_len, dtype=np.int32)
+        else:
+            scores = np.zeros_like(cell, dtype=np.int32)
+        scores[:len(sorted_indices)] = sorted_indices.astype(np.int32)
+        scores_final[i, :] = scores
+    return scores_final
+class NicheformerTokenizer(PreTrainedTokenizer):
+    """Tokenizer for Nicheformer that handles single-cell data."""
+    model_input_names = ["input_ids", "attention_mask"]
+    vocab_files_names = {"vocab_file": "vocab.json"}
+    modality_dict = MODALITY_DICT
+    species_dict = SPECIES_DICT
+    technology_dict = TECHNOLOGY_DICT
+    def __init__(
+        self,
+        vocab_file=None,
+        max_length: int = 1500,
+        aux_tokens: int = 30,
+        median_counts_per_gene: Optional[np.ndarray] = None,
+        gene_names: Optional[List[str]] = None,
+        **kwargs
+    ):
+        # Initialize base vocabulary
+        self._vocabulary = {
+            "[PAD]": PAD_TOKEN,
+            "[MASK]": MASK_TOKEN,
+            "[CLS]": CLS_TOKEN,
+        }
+        if vocab_file is not None:
+            with open(vocab_file, 'r') as f:
+                self._vocabulary.update(json.load(f))
+        else:
+            # Add modality tokens
+            for name, idx in self.modality_dict.items():
+                self._vocabulary[f"[MODALITY_{name}]"] = idx
+            # Add species tokens
+            for name, idx in self.species_dict.items():
+                if name in ["Homo sapiens", "Mus musculus"]:
+                    continue  # Skip redundant names
+                self._vocabulary[f"[SPECIES_{name}]"] = idx
+            # Add technology tokens
+            for name, idx in self.technology_dict.items():
+                if name in ["MERFISH", "10x transcription profiling"]:
+                    continue  # Skip redundant names
+                clean_name = name.lower().replace(" ", "_").replace("'", "_")
+                self._vocabulary[f"[TECH_{clean_name}]"] = idx
+            # Add gene tokens if provided
+            if gene_names is not None:
+                for i, gene in enumerate(gene_names):
+                    self._vocabulary[gene] = i + aux_tokens
+                # Save vocabulary
+                os.makedirs('to_hf', exist_ok=True)
+                with open('to_hf/vocab.json', 'w') as f:
+                    json.dump(self._vocabulary, f, indent=4)
+        super().__init__(**kwargs)
+        self.max_length = max_length
+        self.aux_tokens = aux_tokens
+        self.median_counts_per_gene = median_counts_per_gene
+        self.gene_names = gene_names
+        # Set up special token mappings
+        self._pad_token = "[PAD]"
+        self._mask_token = "[MASK]"
+        self._cls_token = "[CLS]"
+    def get_vocab(self) -> Dict[str, int]:
+        """Returns the vocabulary mapping."""
+        return self._vocabulary.copy()
+    def _tokenize(self, text: str) -> List[str]:
+        """Tokenize text input."""
+        # This tokenizer doesn't handle text input directly
+        raise NotImplementedError("This tokenizer only works with gene expression data")
+    def _convert_token_to_id(self, token: str) -> int:
+        """Convert token to ID."""
+        # First check special token mappings
+        if token in self.modality_dict:
+            return self.modality_dict[token]
+        if token in self.species_dict:
+            return self.species_dict[token]
+        if token in self.technology_dict:
+            return self.technology_dict[token]
+        # Then check vocabulary
+        return self._vocabulary.get(token, self._vocabulary["[PAD]"])
+    def _convert_id_to_token(self, index: int) -> str:
+        """Convert ID to token."""
+        # First check special token mappings
+        for token, idx in self.modality_dict.items():
+            if idx == index:
+                return token
+        for token, idx in self.species_dict.items():
+            if idx == index:
+                return token
+        for token, idx in self.technology_dict.items():
+            if idx == index:
+                return token
+        # Then check vocabulary
+        for token, idx in self._vocabulary.items():
+            if idx == index:
+                return token
+        return "[PAD]"
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        """Save the vocabulary to a file."""
+        vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
+        )
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            json.dump(self._vocabulary, f, ensure_ascii=False)
+        return (vocab_file,)
+    def _tokenize_gene_expression(self, x: np.ndarray) -> np.ndarray:
+        """Tokenize gene expression matrix.
+        Args:
+            x: Gene expression matrix (cells x genes)
+        Returns:
+            Tokenized matrix
+        """
+        # Handle sparse input
+        if issparse(x):
+            x = x.toarray()
+        # Normalize and scale
+        x = np.nan_to_num(x)
+        x = sf_normalize(x)
+        if self.median_counts_per_gene is not None:
+            median_counts = self.median_counts_per_gene.copy()
+            median_counts += median_counts == 0
+            x = x / median_counts.reshape((1, -1))
+        # Convert to tokens
+        tokens = _sub_tokenize_data(x, self.max_length, self.aux_tokens)
+        return tokens.astype(np.int32)
+    def __call__(
+        self,
+        adata: Optional[ad.AnnData] = None,
+        gene_expression: Optional[Union[np.ndarray, List[float]]] = None,
+        modality: Optional[str] = None,
+        species: Optional[str] = None,
+        technology: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, torch.Tensor]:
+        """Convert inputs to model inputs.
+        Args:
+            adata: AnnData object
+            gene_expression: Gene expression matrix if not using AnnData
+            modality: Modality type
+            species: Species type
+            technology: Technology/assay type
+        Returns:
+            Dictionary with model inputs
+        """
+        if adata is not None:
+            # Get expression matrix
+            if issparse(adata.X):
+                x = adata.X.toarray()
+            else:
+                x = adata.X
+            # Get metadata for each cell if not provided
+            if modality is None and 'modality' in adata.obs:
+                modality = adata.obs['modality'].values
+            if species is None and 'specie' in adata.obs:
+                species = adata.obs['specie'].values
+            if technology is None and 'assay' in adata.obs:
+                technology = adata.obs['assay'].values
+        elif gene_expression is not None:
+            x = np.array(gene_expression)
+            if len(x.shape) == 1:
+                x = x.reshape(1, -1)
+            # For single gene expression input, convert scalar metadata to arrays
+            if modality is not None:
+                modality = np.array([modality])
+            if species is not None:
+                species = np.array([species])
+            if technology is not None:
+                technology = np.array([technology])
+        else:
+            raise ValueError("Either adata or gene_expression must be provided")
+        # Tokenize gene expression
+        token_ids = self._tokenize_gene_expression(x)
+        n_cells = token_ids.shape[0]
+        # Add special tokens for each cell
+        special_tokens = np.zeros((n_cells, 3), dtype=np.int32)  # 3 for modality, species, technology
+        special_token_mask = np.zeros((n_cells, 3), dtype=bool)  # Track which tokens are actually present
+        if modality is not None:
+            special_tokens[:, 0] = [self.modality_dict.get(m, self._vocabulary["[PAD]"]) for m in modality]
+            special_token_mask[:, 0] = True
+        if species is not None:
+            special_tokens[:, 1] = [self.species_dict.get(s, self._vocabulary["[PAD]"]) for s in species]
+            special_token_mask[:, 1] = True
+        if technology is not None:
+            special_tokens[:, 2] = [self.technology_dict.get(t, self._vocabulary["[PAD]"]) for t in technology]
+            special_token_mask[:, 2] = True
+        # Only keep the special tokens that are present (have True in mask)
+        special_tokens = special_tokens[:, special_token_mask[0]]
+        if special_tokens.size > 0:
+            token_ids = np.concatenate([special_tokens, token_ids[:, :(self.max_length - special_tokens.shape[1])]], axis=1)
+        # Create attention mask
+        attention_mask = (token_ids != self._vocabulary["[PAD]"])
+        return {
+            "input_ids": torch.tensor(token_ids, dtype=torch.long),
+            "attention_mask": torch.tensor(attention_mask)
+        }
+    def get_vocab_size(self) -> int:
+        """Get vocabulary size."""
+        if self.gene_names is not None:
+            return len(self.gene_names) + self.aux_tokens
+        return max(
+            max(self.modality_dict.values()),
+            max(self.species_dict.values()),
+            max(self.technology_dict.values())
+        ) + 1
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        """Convert a sequence of tokens to a string. Not used for gene expression."""
+        raise NotImplementedError("This tokenizer only works with gene expression data")
+    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:
+        """Build model inputs from a sequence by adding special tokens."""
+        # For gene expression data, special tokens are handled in __call__
+        return token_ids_0
+    def get_special_tokens_mask(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False) -> List[int]:
+        """Get list where entries are [1] if a token is [special] else [0]."""
+        # Consider tokens < aux_tokens as special
+        return [1 if token_id < self.aux_tokens else 0 for token_id in token_ids_0]

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff