Add classifier module and examples

Files changed (12) hide show

docs/source/api.rst +8 -0
docs/source/geneformer.classifier.rst +9 -0
examples/cell_classification.ipynb +0 -0
examples/extract_and_plot_cell_embeddings.ipynb +1 -1
examples/gene_classification.ipynb +0 -0
geneformer/__init__.py +18 -10
geneformer/classifier.py +1203 -0
geneformer/classifier_utils.py +406 -0
geneformer/emb_extractor.py +3 -9
geneformer/evaluation_utils.py +287 -0
geneformer/in_silico_perturber_stats.py +6 -0
geneformer/tokenizer.py +25 -14

docs/source/api.rst CHANGED Viewed

@@ -9,6 +9,14 @@ Tokenizer
    geneformer.tokenizer
 Embedding Extractor
 -------------------

    geneformer.tokenizer
+Classifier
+----------
+.. toctree::
+   :maxdepth: 1
+   geneformer.classifier
 Embedding Extractor
 -------------------

docs/source/geneformer.classifier.rst ADDED Viewed

	@@ -0,0 +1,9 @@

+geneformer.classifier
+=====================
+.. automodule:: geneformer.classifier
+   :members:
+   :undoc-members:
+   :show-inheritance:
+   :exclude-members:
+        validate_options

examples/cell_classification.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

examples/extract_and_plot_cell_embeddings.ipynb CHANGED Viewed

@@ -129,7 +129,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.11"
   }
  },
  "nbformat": 4,

    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
+   "version": "3.11.5"
   }
  },
  "nbformat": 4,

examples/gene_classification.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

geneformer/__init__.py CHANGED Viewed

@@ -1,12 +1,20 @@
-from . import tokenizer
-from . import pretrainer
-from . import collator_for_classification
-from . import in_silico_perturber
-from . import in_silico_perturber_stats
-from .tokenizer import TranscriptomeTokenizer
-from .pretrainer import GeneformerPretrainer
-from .collator_for_classification import DataCollatorForGeneClassification
-from .collator_for_classification import DataCollatorForCellClassification
 from .emb_extractor import EmbExtractor
 from .in_silico_perturber import InSilicoPerturber
-from .in_silico_perturber_stats import InSilicoPerturberStats

+# ruff: noqa: F401
+from . import classifier  # noqa
+from . import (
+    collator_for_classification,
+    emb_extractor,
+    in_silico_perturber,
+    in_silico_perturber_stats,
+    pretrainer,
+    tokenizer,
+)
+from .classifier import Classifier
+from .collator_for_classification import (
+    DataCollatorForCellClassification,
+    DataCollatorForGeneClassification,
+)
 from .emb_extractor import EmbExtractor
 from .in_silico_perturber import InSilicoPerturber
+from .in_silico_perturber_stats import InSilicoPerturberStats
+from .pretrainer import GeneformerPretrainer
+from .tokenizer import TranscriptomeTokenizer

geneformer/classifier.py ADDED Viewed

	@@ -0,0 +1,1203 @@

+"""
+Geneformer classifier.
+**Input data:**
+Cell state classifier:
+| Single-cell transcriptomes as Geneformer rank value encodings with cell state labels
+| in Geneformer .dataset format (generated from single-cell RNAseq data by tokenizer.py)
+Gene classifier:
+| Dictionary in format {Gene_label: list(genes)} for gene labels
+| and single-cell transcriptomes as Geneformer rank value encodings
+| in Geneformer .dataset format (generated from single-cell RNAseq data by tokenizer.py)
+**Usage:**
+.. code-block :: python
+    >>> from geneformer import Classifier
+    >>> cc = Classifier(classifier="cell",  # example of cell state classifier
+    ...                 cell_state_dict={"state_key": "disease", "states": "all"},
+    ...                 filter_data={"cell_type":["Cardiomyocyte1","Cardiomyocyte2","Cardiomyocyte3"]},
+    ...                 training_args=training_args,
+    ...                 freeze_layers = 2,
+    ...                 num_crossval_splits = 1,
+    ...                 forward_batch_size=200,
+    ...                 nproc=16)
+    >>> cc.prepare_data(input_data_file="path/to/input_data",
+    ...                 output_directory="path/to/output_directory",
+    ...                 output_prefix="output_prefix")
+    >>> all_metrics = cc.validate(model_directory="path/to/model",
+    ...                           prepared_input_data_file=f"path/to/output_directory/{output_prefix}_labeled.dataset",
+    ...                           id_class_dict_file=f"path/to/output_directory/{output_prefix}_id_class_dict.pkl",
+    ...                           output_directory="path/to/output_directory",
+    ...                           output_prefix="output_prefix",
+    ...                           predict_eval=True)
+    >>> cc.plot_conf_mat(conf_mat_dict={"Geneformer": all_metrics["conf_matrix"]},
+    ...                  output_directory="path/to/output_directory",
+    ...                  output_prefix="output_prefix",
+    ...                  custom_class_order=["healthy","disease1","disease2"])
+    >>> cc.plot_predictions(predictions_file=f"path/to/output_directory/datestamp_geneformer_cellClassifier_{output_prefix}/ksplit1/predictions.pkl",
+    ...                     id_class_dict_file=f"path/to/output_directory/{output_prefix}_id_class_dict.pkl",
+    ...                     title="disease",
+    ...                     output_directory="path/to/output_directory",
+    ...                     output_prefix="output_prefix",
+    ...                     custom_class_order=["healthy","disease1","disease2"])
+"""
+import datetime
+import logging
+import os
+import pickle
+import subprocess
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from sklearn.model_selection import StratifiedKFold
+from tqdm.auto import tqdm, trange
+from transformers import Trainer
+from transformers.training_args import TrainingArguments
+from . import DataCollatorForCellClassification, DataCollatorForGeneClassification
+from . import classifier_utils as cu
+from . import evaluation_utils as eu
+from . import perturber_utils as pu
+from .tokenizer import TOKEN_DICTIONARY_FILE
+# apply seaborn's default theme; this is a global side effect that runs at import time
+sns.set()
+# module-level logger named after this module
+logger = logging.getLogger(__name__)
+class Classifier:
+    # Allowed settings for each constructor option, checked by validate_options.
+    # Each set mixes two kinds of entries:
+    #   - literal values (e.g. "cell", None, 0, 1, 5): the option may equal one of them
+    #   - types (e.g. int, dict, bool): the option may be an instance of one of them
+    valid_option_dict = {
+        "classifier": {"cell", "gene"},
+        "cell_state_dict": {None, dict},
+        "gene_class_dict": {None, dict},
+        "filter_data": {None, dict},
+        "rare_threshold": {int, float},
+        "max_ncells": {None, int},
+        "max_ncells_per_class": {None, int},
+        "training_args": {None, dict},
+        "freeze_layers": {int},
+        "num_crossval_splits": {0, 1, 5},
+        "eval_size": {int, float},
+        "no_eval": {bool},
+        "stratify_splits_col": {None, str},
+        "forward_batch_size": {int},
+        "nproc": {int},
+    }
+    def __init__(
+        self,
+        classifier=None,
+        cell_state_dict=None,
+        gene_class_dict=None,
+        filter_data=None,
+        rare_threshold=0,
+        max_ncells=None,
+        max_ncells_per_class=None,
+        training_args=None,
+        freeze_layers=0,
+        num_crossval_splits=1,
+        eval_size=0.2,
+        stratify_splits_col=None,
+        no_eval=False,
+        forward_batch_size=100,
+        nproc=4,
+    ):
+        """
+        Initialize Geneformer classifier.
+        **Parameters:**
+        classifier : {"cell", "gene"}
+            | Whether to fine-tune a cell state or gene classifier.
+        cell_state_dict : None, dict
+            | Cell states to fine-tune model to distinguish.
+            | Two-item dictionary with keys: state_key and states
+            | state_key: key specifying name of column in .dataset that defines the states to model
+            | states: list of values in the state_key column that specifies the states to model
+            | Alternatively, instead of a list of states, can specify "all" to use all states in that state key from input data.
+            | Of note, if using "all", states will be defined after data is filtered.
+            | Must have at least 2 states to model.
+            | For example: {"state_key": "disease",
+            |               "states": ["nf", "hcm", "dcm"]}
+            |               or
+            |               {"state_key": "disease",
+            |               "states": "all"}
+        gene_class_dict : None, dict
+            | Gene classes to fine-tune model to distinguish.
+            | Dictionary in format: {Gene_label_A: list(geneA1, geneA2, ...),
+            |                        Gene_label_B: list(geneB1, geneB2, ...)}
+            | Gene values should be Ensembl IDs.
+        filter_data : None, dict
+            | Default is to fine-tune with all input data.
+            | Otherwise, dictionary specifying .dataset column name and list of values to filter by.
+        rare_threshold : float
+            | Threshold below which rare cell states should be removed.
+            | For example, setting to 0.05 will remove cell states representing
+            | < 5% of the total cells from the cell state classifier's possible classes.
+        max_ncells : None, int
+            | Maximum number of cells to use for fine-tuning.
+            | Default is to fine-tune with all input data.
+        max_ncells_per_class : None, int
+            | Maximum number of cells per cell class to use for fine-tuning.
+            | Of note, will be applied after max_ncells above.
+            | (Only valid for cell classification.)
+        training_args : None, dict
+            | Training arguments for fine-tuning.
+            | If None, defaults will be inferred for 6 layer Geneformer.
+            | Otherwise, will use the Hugging Face defaults:
+            | https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
+            | Note: Hyperparameter tuning is highly recommended, rather than using defaults.
+        freeze_layers : int
+            | Number of layers to freeze from fine-tuning.
+            | 0: no layers will be frozen; 2: first two layers will be frozen; etc.
+        num_crossval_splits : {0, 1, 5}
+            | 0: train on all data without splitting
+            | 1: split data into train and eval sets by designated eval_size
+            | 5: split data into 5 folds of train and eval sets by designated eval_size
+        eval_size : None, float
+            | Proportion of data to hold out for evaluation (e.g. 0.2 if intending 80:20 train/eval split)
+        stratify_splits_col : None, str
+            | Name of column in .dataset to be used for stratified splitting.
+            | Proportion of each class in this column will be the same in the splits as in the original dataset.
+        no_eval : bool
+            | If True, will skip eval step and use all data for training.
+            | Otherwise, will perform eval during training.
+        forward_batch_size : int
+            | Batch size for forward pass (for evaluation, not training).
+        nproc : int
+            | Number of CPU processes to use.
+        """
+        self.classifier = classifier
+        self.cell_state_dict = cell_state_dict
+        self.gene_class_dict = gene_class_dict
+        self.filter_data = filter_data
+        self.rare_threshold = rare_threshold
+        self.max_ncells = max_ncells
+        self.max_ncells_per_class = max_ncells_per_class
+        self.training_args = training_args
+        self.freeze_layers = freeze_layers
+        self.num_crossval_splits = num_crossval_splits
+        self.eval_size = eval_size
+        self.stratify_splits_col = stratify_splits_col
+        self.no_eval = no_eval
+        self.forward_batch_size = forward_batch_size
+        self.nproc = nproc
+        if self.training_args is None:
+            logger.warning(
+                "Hyperparameter tuning is highly recommended for optimal results. "
+                "No training_args provided; using default hyperparameters."
+            )
+        self.validate_options()
+        if self.filter_data is None:
+            self.filter_data = dict()
+        if self.classifier == "cell":
+            if self.cell_state_dict["states"] != "all":
+                self.filter_data[
+                    self.cell_state_dict["state_key"]
+                ] = self.cell_state_dict["states"]
+        # load token dictionary (Ensembl IDs:token)
+        with open(TOKEN_DICTIONARY_FILE, "rb") as f:
+            self.gene_token_dict = pickle.load(f)
+        self.token_gene_dict = {v: k for k, v in self.gene_token_dict.items()}
+        # filter genes for gene classification for those in token dictionary
+        if self.classifier == "gene":
+            all_gene_class_values = set(pu.flatten_list(self.gene_class_dict.values()))
+            missing_genes = [
+                gene
+                for gene in all_gene_class_values
+                if gene not in self.gene_token_dict.keys()
+            ]
+            if len(missing_genes) == len(all_gene_class_values):
+                logger.error(
+                    "None of the provided genes to classify are in token dictionary."
+                )
+                raise
+            elif len(missing_genes) > 0:
+                logger.warning(
+                    f"Genes to classify {missing_genes} are not in token dictionary."
+                )
+            self.gene_class_dict = {
+                k: set([self.gene_token_dict.get(gene) for gene in v])
+                for k, v in self.gene_class_dict.items()
+            }
+            empty_classes = []
+            for k, v in self.gene_class_dict.items():
+                if len(v) == 0:
+                    empty_classes += [k]
+            if len(empty_classes) > 0:
+                logger.error(
+                    f"Class(es) {empty_classes} did not contain any genes in the token dictionary."
+                )
+                raise
+    def validate_options(self):
+        # confirm arguments are within valid options and compatible with each other
+        for attr_name, valid_options in self.valid_option_dict.items():
+            attr_value = self.__dict__[attr_name]
+            if not isinstance(attr_value, (list, dict)):
+                if attr_value in valid_options:
+                    continue
+            valid_type = False
+            for option in valid_options:
+                if (option in [int, float, list, dict, bool]) and isinstance(
+                    attr_value, option
+                ):
+                    valid_type = True
+                    break
+            if valid_type:
+                continue
+            logger.error(
+                f"Invalid option for {attr_name}. "
+                f"Valid options for {attr_name}: {valid_options}"
+            )
+            raise
+        if self.filter_data is not None:
+            for key, value in self.filter_data.items():
+                if not isinstance(value, list):
+                    self.filter_data[key] = [value]
+                    logger.warning(
+                        "Values in filter_data dict must be lists. "
+                        f"Changing {key} value to list ([{value}])."
+                    )
+        if self.classifier == "cell":
+            if set(self.cell_state_dict.keys()) != set(["state_key", "states"]):
+                logger.error(
+                    "Invalid keys for cell_state_dict. "
+                    "The cell_state_dict should have only 2 keys: state_key and states"
+                )
+                raise
+            if self.cell_state_dict["states"] != "all":
+                if not isinstance(self.cell_state_dict["states"], list):
+                    logger.error(
+                        "States in cell_state_dict should be list of states to model."
+                    )
+                    raise
+                if len(self.cell_state_dict["states"]) < 2:
+                    logger.error(
+                        "States in cell_state_dict should contain at least 2 states to classify."
+                    )
+                    raise
+        if self.classifier == "gene":
+            if len(self.gene_class_dict.keys()) < 2:
+                logger.error(
+                    "Gene_class_dict should contain at least 2 gene classes to classify."
+                )
+                raise
+    def prepare_data(
+        self,
+        input_data_file,
+        output_directory,
+        output_prefix,
+        split_id_dict=None,
+        test_size=0,
+        attr_to_split=None,
+        attr_to_balance=None,
+        max_trials=100,
+        pval_threshold=0.1,
+    ):
+        """
+        Prepare data for cell state or gene classification.
+        **Parameters**
+        input_data_file : Path
+            | Path to directory containing .dataset input
+        output_directory : Path
+            | Path to directory where prepared data will be saved
+        output_prefix : str
+            | Prefix for output file
+        split_id_dict : None, dict
+            | Dictionary of IDs for train and test splits
+            | Three-item dictionary with keys: attr_key, train, test
+            | attr_key: key specifying name of column in .dataset that contains the IDs for the data splits
+            | train: list of IDs in the attr_key column to include in the train split
+            | test: list of IDs in the attr_key column to include in the test split
+            | For example: {"attr_key": "individual",
+            |               "train": ["patient1", "patient2", "patient3", "patient4"],
+            |               "test": ["patient5", "patient6"]}
+        test_size : None, float
+            | Proportion of data to be saved separately and held out for test set
+            | (e.g. 0.2 if intending hold out 20%)
+            | The training set will be further split to train / validation in self.validate
+            | Note: only available for CellClassifiers
+        attr_to_split : None, str
+            | Key for attribute on which to split data while balancing potential confounders
+            | e.g. "patient_id" for splitting by patient while balancing other characteristics
+            | Note: only available for CellClassifiers
+        attr_to_balance : None, list
+            | List of attribute keys on which to balance data while splitting on attr_to_split
+            | e.g. ["age", "sex"] for balancing these characteristics while splitting by patient
+            | Note: only available for CellClassifiers
+        max_trials : None, int
+            | Maximum number of trials of random splitting to try to achieve balanced other attributes
+            | If no split is found without significant (p<0.05) differences in other attributes, will select best
+            | Note: only available for CellClassifiers
+        pval_threshold : None, float
+            | P-value threshold to use for attribute balancing across splits
+            | E.g. if set to 0.1, will accept trial if p >= 0.1 for all attributes in attr_to_balance
+        """
+        # prepare data and labels for classification
+        data = pu.load_and_filter(self.filter_data, self.nproc, input_data_file)
+        if self.classifier == "cell":
+            if "label" in data.features:
+                logger.error(
+                    "Column name 'label' must be reserved for class IDs. Please rename column."
+                )
+                raise
+        elif self.classifier == "gene":
+            if "labels" in data.features:
+                logger.error(
+                    "Column name 'labels' must be reserved for class IDs. Please rename column."
+                )
+                raise
+        if self.classifier == "cell":
+            # remove cell states representing < rare_threshold of cells
+            data = cu.remove_rare(
+                data, self.rare_threshold, self.cell_state_dict["state_key"], self.nproc
+            )
+            # downsample max cells and max per class
+            data = cu.downsample_and_shuffle(
+                data, self.max_ncells, self.max_ncells_per_class, self.cell_state_dict
+            )
+            # rename cell state column to "label"
+            data = cu.rename_cols(data, self.cell_state_dict["state_key"])
+        # convert classes to numerical labels and save as id_class_dict
+        # of note, will label all genes in gene_class_dict
+        # if (cross-)validating, genes will be relabeled in column "labels" for each split
+        # at the time of training with Classifier.validate
+        data, id_class_dict = cu.label_classes(
+            self.classifier, data, self.gene_class_dict, self.nproc
+        )
+        # save id_class_dict for future reference
+        id_class_output_path = (
+            Path(output_directory) / f"{output_prefix}_id_class_dict"
+        ).with_suffix(".pkl")
+        with open(id_class_output_path, "wb") as f:
+            pickle.dump(id_class_dict, f)
+        if split_id_dict is not None:
+            data_dict = dict()
+            data_dict["train"] = pu.filter_by_dict(
+                data, {split_id_dict["attr_key"]: split_id_dict["train"]}, self.nproc
+            )
+            data_dict["test"] = pu.filter_by_dict(
+                data, {split_id_dict["attr_key"]: split_id_dict["test"]}, self.nproc
+            )
+            train_data_output_path = (
+                Path(output_directory) / f"{output_prefix}_labeled_train"
+            ).with_suffix(".dataset")
+            test_data_output_path = (
+                Path(output_directory) / f"{output_prefix}_labeled_test"
+            ).with_suffix(".dataset")
+            data_dict["train"].save_to_disk(train_data_output_path)
+            data_dict["test"].save_to_disk(test_data_output_path)
+        elif (test_size is not None) and (self.classifier == "cell"):
+            if 1 > test_size > 0:
+                data_dict, balance_df = cu.balance_attr_splits(
+                    data,
+                    attr_to_split,
+                    attr_to_balance,
+                    test_size,
+                    max_trials,
+                    pval_threshold,
+                    self.cell_state_dict["state_key"],
+                    self.nproc,
+                )
+                balance_df.to_csv(
+                    f"{output_directory}/{output_prefix}_train_test_balance_df.csv"
+                )
+                train_data_output_path = (
+                    Path(output_directory) / f"{output_prefix}_labeled_train"
+                ).with_suffix(".dataset")
+                test_data_output_path = (
+                    Path(output_directory) / f"{output_prefix}_labeled_test"
+                ).with_suffix(".dataset")
+                data_dict["train"].save_to_disk(train_data_output_path)
+                data_dict["test"].save_to_disk(test_data_output_path)
+        else:
+            data_output_path = (
+                Path(output_directory) / f"{output_prefix}_labeled"
+            ).with_suffix(".dataset")
+            data.save_to_disk(data_output_path)
+    def train_all_data(
+        self,
+        model_directory,
+        prepared_input_data_file,
+        id_class_dict_file,
+        output_directory,
+        output_prefix,
+        save_eval_output=True,
+    ):
+        """
+        Train cell state or gene classifier using all data.
+        **Parameters**
+        model_directory : Path
+            | Path to directory containing model
+        prepared_input_data_file : Path
+            | Path to directory containing _labeled.dataset previously prepared by Classifier.prepare_data
+        id_class_dict_file : Path
+            | Path to _id_class_dict.pkl previously prepared by Classifier.prepare_data
+            | (dictionary of format: numerical IDs: class_labels)
+        output_directory : Path
+            | Path to directory where model and eval data will be saved
+        output_prefix : str
+            | Prefix for output files
+        save_eval_output : bool
+            | Whether to save cross-fold eval output
+            | Saves as pickle file of dictionary of eval metrics
+        **Output**
+        Returns trainer after fine-tuning with all data.
+        """
+        ##### Load data and prepare output directory #####
+        # load numerical id to class dictionary (id:class)
+        with open(id_class_dict_file, "rb") as f:
+            id_class_dict = pickle.load(f)
+        class_id_dict = {v: k for k, v in id_class_dict.items()}
+        # load previously filtered and prepared data
+        data = pu.load_and_filter(None, self.nproc, prepared_input_data_file)
+        data = data.shuffle(seed=42)  # reshuffle in case users provide unshuffled data
+        # define output directory path
+        current_date = datetime.datetime.now()
+        datestamp = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}"
+        if output_directory[-1:] != "/":  # add slash for dir if not present
+            output_directory = output_directory + "/"
+        output_dir = f"{output_directory}{datestamp}_geneformer_{self.classifier}Classifier_{output_prefix}/"
+        subprocess.call(f"mkdir {output_dir}", shell=True)
+        # get number of classes for classifier
+        num_classes = cu.get_num_classes(id_class_dict)
+        if self.classifier == "gene":
+            targets = pu.flatten_list(self.gene_class_dict.values())
+            labels = pu.flatten_list(
+                [
+                    [class_id_dict[label]] * len(targets)
+                    for label, targets in self.gene_class_dict.items()
+                ]
+            )
+            assert len(targets) == len(labels)
+            data = cu.prep_gene_classifier_all_data(
+                data, targets, labels, self.max_ncells, self.nproc
+            )
+        trainer = self.train_classifier(
+            model_directory, num_classes, data, None, output_dir
+        )
+        return trainer
+    def validate(
+        self,
+        model_directory,
+        prepared_input_data_file,
+        id_class_dict_file,
+        output_directory,
+        output_prefix,
+        split_id_dict=None,
+        attr_to_split=None,
+        attr_to_balance=None,
+        max_trials=100,
+        pval_threshold=0.1,
+        save_eval_output=True,
+        predict_eval=True,
+        predict_trainer=False,
+    ):
+        """
+        (Cross-)validate cell state or gene classifier.
+        **Parameters**
+        model_directory : Path
+            | Path to directory containing model
+        prepared_input_data_file : Path
+            | Path to directory containing _labeled.dataset previously prepared by Classifier.prepare_data
+        id_class_dict_file : Path
+            | Path to _id_class_dict.pkl previously prepared by Classifier.prepare_data
+            | (dictionary of format: numerical IDs: class_labels)
+        output_directory : Path
+            | Path to directory where model and eval data will be saved
+        output_prefix : str
+            | Prefix for output files
+        split_id_dict : None, dict
+            | Dictionary of IDs for train and eval splits
+            | Three-item dictionary with keys: attr_key, train, eval
+            | attr_key: key specifying name of column in .dataset that contains the IDs for the data splits
+            | train: list of IDs in the attr_key column to include in the train split
+            | eval: list of IDs in the attr_key column to include in the eval split
+            | For example: {"attr_key": "individual",
+            |               "train": ["patient1", "patient2", "patient3", "patient4"],
+            |               "eval": ["patient5", "patient6"]}
+            | Note: only available for CellClassifiers with 1-fold split (self.classifier="cell"; self.num_crossval_splits=1)
+        attr_to_split : None, str
+            | Key for attribute on which to split data while balancing potential confounders
+            | e.g. "patient_id" for splitting by patient while balancing other characteristics
+            | Note: only available for CellClassifiers with 1-fold split (self.classifier="cell"; self.num_crossval_splits=1)
+        attr_to_balance : None, list
+            | List of attribute keys on which to balance data while splitting on attr_to_split
+            | e.g. ["age", "sex"] for balancing these characteristics while splitting by patient
+        max_trials : None, int
+            | Maximum number of trials of random splitting to try to achieve balanced other attribute
+            | If no split is found without significant (p < pval_threshold) differences in other attributes, will select best
+        pval_threshold : None, float
+            | P-value threshold to use for attribute balancing across splits
+            | E.g. if set to 0.1, will accept trial if p >= 0.1 for all attributes in attr_to_balance
+        save_eval_output : bool
+            | Whether to save cross-fold eval output
+            | Saves as pickle file of dictionary of eval metrics
+        predict_eval : bool
+            | Whether or not to save eval predictions
+            | Saves as a pickle file of self.evaluate predictions
+        predict_trainer : bool
+            | Whether or not to save eval predictions from trainer
+            | Saves as a pickle file of trainer predictions
+        """
+        if self.num_crossval_splits == 0:
+            logger.error("num_crossval_splits must be 1 or 5 to validate.")
+            raise
+        # ensure number of genes in each class is > 5 if validating model
+        if self.classifier == "gene":
+            insuff_classes = [k for k, v in self.gene_class_dict.items() if len(v) < 5]
+            if (self.num_crossval_splits > 0) and (len(insuff_classes) > 0):
+                logger.error(
+                    f"Insufficient # of members in class(es) {insuff_classes} to (cross-)validate."
+                )
+                raise
+        ##### Load data and prepare output directory #####
+        # load numerical id to class dictionary (id:class)
+        with open(id_class_dict_file, "rb") as f:
+            id_class_dict = pickle.load(f)
+        class_id_dict = {v: k for k, v in id_class_dict.items()}
+        # load previously filtered and prepared data
+        data = pu.load_and_filter(None, self.nproc, prepared_input_data_file)
+        data = data.shuffle(seed=42)  # reshuffle in case users provide unshuffled data
+        # define output directory path
+        current_date = datetime.datetime.now()
+        datestamp = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}"
+        if output_directory[-1:] != "/":  # add slash for dir if not present
+            output_directory = output_directory + "/"
+        output_dir = f"{output_directory}{datestamp}_geneformer_{self.classifier}Classifier_{output_prefix}/"
+        subprocess.call(f"mkdir {output_dir}", shell=True)
+        # get number of classes for classifier
+        num_classes = cu.get_num_classes(id_class_dict)
+        ##### (Cross-)validate the model #####
+        results = []
+        all_conf_mat = np.zeros((num_classes, num_classes))
+        iteration_num = 1
+        if self.classifier == "cell":
+            for i in trange(self.num_crossval_splits):
+                print(
+                    f"****** Validation split: {iteration_num}/{self.num_crossval_splits} ******\n"
+                )
+                ksplit_output_dir = os.path.join(output_dir, f"ksplit{iteration_num}")
+                if self.num_crossval_splits == 1:
+                    # single 1-eval_size:eval_size split
+                    if split_id_dict is not None:
+                        data_dict = dict()
+                        data_dict["train"] = pu.filter_by_dict(
+                            data,
+                            {split_id_dict["attr_key"]: split_id_dict["train"]},
+                            self.nproc,
+                        )
+                        data_dict["test"] = pu.filter_by_dict(
+                            data,
+                            {split_id_dict["attr_key"]: split_id_dict["eval"]},
+                            self.nproc,
+                        )
+                    elif attr_to_split is not None:
+                        data_dict, balance_df = cu.balance_attr_splits(
+                            data,
+                            attr_to_split,
+                            attr_to_balance,
+                            self.eval_size,
+                            max_trials,
+                            pval_threshold,
+                            self.cell_state_dict["state_key"],
+                            self.nproc,
+                        )
+                        balance_df.to_csv(
+                            f"{output_dir}/{output_prefix}_train_valid_balance_df.csv"
+                        )
+                    else:
+                        data_dict = data.train_test_split(
+                            test_size=self.eval_size,
+                            stratify_by_column=self.stratify_splits_col,
+                            seed=42,
+                        )
+                    train_data = data_dict["train"]
+                    eval_data = data_dict["test"]
+                else:
+                    # 5-fold cross-validate
+                    num_cells = len(data)
+                    fifth_cells = num_cells * 0.2
+                    num_eval = min((self.eval_size * num_cells), fifth_cells)
+                    start = i * fifth_cells
+                    end = start + num_eval
+                    eval_indices = [j for j in range(start, end)]
+                    train_indices = [
+                        j for j in range(num_cells) if j not in eval_indices
+                    ]
+                    eval_data = data.select(eval_indices)
+                    train_data = data.select(train_indices)
+                trainer = self.train_classifier(
+                    model_directory,
+                    num_classes,
+                    train_data,
+                    eval_data,
+                    ksplit_output_dir,
+                    predict_trainer,
+                )
+                result = self.evaluate_model(
+                    trainer.model,
+                    num_classes,
+                    id_class_dict,
+                    eval_data,
+                    predict_eval,
+                    ksplit_output_dir,
+                    output_prefix,
+                )
+                results += [result]
+                all_conf_mat = all_conf_mat + result["conf_mat"]
+                iteration_num = iteration_num + 1
+        elif self.classifier == "gene":
+            # set up (cross-)validation splits
+            targets = pu.flatten_list(self.gene_class_dict.values())
+            labels = pu.flatten_list(
+                [
+                    [class_id_dict[label]] * len(targets)
+                    for label, targets in self.gene_class_dict.items()
+                ]
+            )
+            assert len(targets) == len(labels)
+            n_splits = int(1 / self.eval_size)
+            skf = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)
+            # (Cross-)validate
+            for train_index, eval_index in tqdm(skf.split(targets, labels)):
+                print(
+                    f"****** Validation split: {iteration_num}/{self.num_crossval_splits} ******\n"
+                )
+                ksplit_output_dir = os.path.join(output_dir, f"ksplit{iteration_num}")
+                # filter data for examples containing classes for this split
+                # subsample to max_ncells and relabel data in column "labels"
+                train_data, eval_data = cu.prep_gene_classifier_split(
+                    data,
+                    targets,
+                    labels,
+                    train_index,
+                    eval_index,
+                    self.max_ncells,
+                    iteration_num,
+                    self.nproc,
+                )
+                trainer = self.train_classifier(
+                    model_directory,
+                    num_classes,
+                    train_data,
+                    eval_data,
+                    ksplit_output_dir,
+                    predict_trainer,
+                )
+                result = self.evaluate_model(
+                    trainer.model,
+                    num_classes,
+                    id_class_dict,
+                    eval_data,
+                    predict_eval,
+                    ksplit_output_dir,
+                    output_prefix,
+                )
+                results += [result]
+                all_conf_mat = all_conf_mat + result["conf_mat"]
+                # break after 1 or 5 splits, each with train/eval proportions dictated by eval_size
+                if iteration_num == self.num_crossval_splits:
+                    break
+                iteration_num = iteration_num + 1
+        all_conf_mat_df = pd.DataFrame(
+            all_conf_mat, columns=id_class_dict.values(), index=id_class_dict.values()
+        )
+        all_metrics = {
+            "conf_matrix": all_conf_mat_df,
+            "macro_f1": [result["macro_f1"] for result in results],
+            "acc": [result["acc"] for result in results],
+        }
+        all_roc_metrics = None  # roc metrics not reported for multiclass
+        if num_classes == 2:
+            mean_fpr = np.linspace(0, 1, 100)
+            all_tpr = [result["roc_metrics"]["interp_tpr"] for result in results]
+            all_roc_auc = [result["roc_metrics"]["auc"] for result in results]
+            all_tpr_wt = [result["roc_metrics"]["tpr_wt"] for result in results]
+            mean_tpr, roc_auc, roc_auc_sd = eu.get_cross_valid_roc_metrics(
+                all_tpr, all_roc_auc, all_tpr_wt
+            )
+            all_roc_metrics = {
+                "mean_tpr": mean_tpr,
+                "mean_fpr": mean_fpr,
+                "all_roc_auc": all_roc_auc,
+                "roc_auc": roc_auc,
+                "roc_auc_sd": roc_auc_sd,
+            }
+        all_metrics["all_roc_metrics"] = all_roc_metrics
+        if save_eval_output is True:
+            eval_metrics_output_path = (
+                Path(output_dir) / f"{output_prefix}_eval_metrics_dict"
+            ).with_suffix(".pkl")
+            with open(eval_metrics_output_path, "wb") as f:
+                pickle.dump(all_metrics, f)
+        return all_metrics
+    def train_classifier(
+        self,
+        model_directory,
+        num_classes,
+        train_data,
+        eval_data,
+        output_directory,
+        predict=False,
+    ):
+        """
+        Fine-tune model for cell state or gene classification.
+        **Parameters**
+        model_directory : Path
+            | Path to directory containing model
+        num_classes : int
+            | Number of classes for classifier
+        train_data : Dataset
+            | Loaded training .dataset input
+            | For cell classifier, labels in column "label".
+            | For gene classifier, labels in column "labels".
+        eval_data : None, Dataset
+            | (Optional) Loaded evaluation .dataset input
+            | For cell classifier, labels in column "label".
+            | For gene classifier, labels in column "labels".
+        output_directory : Path
+            | Path to directory where fine-tuned model will be saved
+        predict : bool
+            | Whether or not to save eval predictions from trainer
+        **Returns**
+        trainer : Trainer
+            | Trainer that was used for fine-tuning (fine-tuned model in trainer.model)
+        **Raises**
+        RuntimeError
+            | If output_directory already contains a saved pytorch_model.bin
+        """
+        ##### Validate and prepare data #####
+        train_data, eval_data = cu.validate_and_clean_cols(
+            train_data, eval_data, self.classifier
+        )
+        if (self.no_eval is True) and (eval_data is not None):
+            logger.warning(
+                "no_eval set to True; model will be trained without evaluation."
+            )
+            eval_data = None
+        if (self.classifier == "gene") and (predict is True):
+            logger.warning(
+                "Predictions during training not currently available for gene classifiers; setting predict to False."
+            )
+            predict = False
+        # predictions are made on eval_data, so they cannot be produced without it;
+        # catch this before training rather than failing after training completes
+        if (predict is True) and (eval_data is None):
+            logger.warning(
+                "No evaluation data available for predictions; setting predict to False."
+            )
+            predict = False
+        # ensure not overwriting previously saved model
+        saved_model_test = os.path.join(output_directory, "pytorch_model.bin")
+        if os.path.isfile(saved_model_test) is True:
+            logger.error("Model already saved to this designated output directory.")
+            raise RuntimeError(
+                "Model already saved to this designated output directory."
+            )
+        # make output directory without going through a shell so that paths
+        # containing spaces or shell metacharacters are handled correctly
+        os.makedirs(output_directory, exist_ok=True)
+        ##### Load model and training args #####
+        if self.classifier == "cell":
+            model_type = "CellClassifier"
+        elif self.classifier == "gene":
+            model_type = "GeneClassifier"
+        model = pu.load_model(model_type, num_classes, model_directory, "train")
+        def_training_args, def_freeze_layers = cu.get_default_train_args(
+            model, self.classifier, train_data, output_directory
+        )
+        # user-provided training args take precedence over the defaults
+        if self.training_args is not None:
+            def_training_args.update(self.training_args)
+        # log roughly 10 times per epoch; never let the step count round down to 0
+        logging_steps = max(
+            1,
+            round(
+                len(train_data) / def_training_args["per_device_train_batch_size"] / 10
+            ),
+        )
+        def_training_args["logging_steps"] = logging_steps
+        def_training_args["output_dir"] = output_directory
+        if eval_data is None:
+            # nothing to evaluate on, so the best model cannot be selected at the end
+            def_training_args["evaluation_strategy"] = "no"
+            def_training_args["load_best_model_at_end"] = False
+        training_args_init = TrainingArguments(**def_training_args)
+        if self.freeze_layers is not None:
+            def_freeze_layers = self.freeze_layers
+        if def_freeze_layers > 0:
+            # freeze the first def_freeze_layers encoder layers
+            modules_to_freeze = model.bert.encoder.layer[:def_freeze_layers]
+            for module in modules_to_freeze:
+                for param in module.parameters():
+                    param.requires_grad = False
+        ##### Fine-tune the model #####
+        # define the data collator
+        if self.classifier == "cell":
+            data_collator = DataCollatorForCellClassification()
+        elif self.classifier == "gene":
+            data_collator = DataCollatorForGeneClassification()
+        # create the trainer
+        trainer = Trainer(
+            model=model,
+            args=training_args_init,
+            data_collator=data_collator,
+            train_dataset=train_data,
+            eval_dataset=eval_data,
+            compute_metrics=cu.compute_metrics,
+        )
+        # train the classifier
+        trainer.train()
+        trainer.save_model(output_directory)
+        if predict is True:
+            # make eval predictions and save predictions and metrics
+            predictions = trainer.predict(eval_data)
+            prediction_output_path = f"{output_directory}/predictions.pkl"
+            with open(prediction_output_path, "wb") as f:
+                pickle.dump(predictions, f)
+            trainer.save_metrics("eval", predictions.metrics)
+        return trainer
+    def evaluate_model(
+        self,
+        model,
+        num_classes,
+        id_class_dict,
+        eval_data,
+        predict=False,
+        output_directory=None,
+        output_prefix=None,
+    ):
+        """
+        Evaluate the fine-tuned model.
+        **Parameters**
+        model : nn.Module
+            | Loaded fine-tuned model (e.g. trainer.model)
+        num_classes : int
+            | Number of classes for classifier
+        id_class_dict : dict
+            | Loaded _id_class_dict.pkl previously prepared by Classifier.prepare_data
+            | (dictionary of format: numerical IDs: class_labels)
+        eval_data : Dataset
+            | Loaded evaluation .dataset input
+        predict : bool
+            | Whether or not to save eval predictions
+        output_directory : Path
+            | Path to directory where eval data will be saved
+            | (only used if predict is True)
+        output_prefix : str
+            | Prefix for output files
+            | (only used if predict is True)
+        **Returns**
+        dict
+            | Dictionary with keys "conf_mat", "macro_f1", "acc", "roc_metrics"
+        """
+        ##### Evaluate the model #####
+        # numerical class IDs serve as the label set for the metrics
+        class_ids = id_class_dict.keys()
+        y_pred, y_true, logits_list = eu.classifier_predict(
+            model, self.classifier, eval_data, self.forward_batch_size
+        )
+        conf_mat, macro_f1, acc, roc_metrics = eu.get_metrics(
+            y_pred, y_true, logits_list, num_classes, class_ids
+        )
+        eval_results = dict(
+            conf_mat=conf_mat,
+            macro_f1=macro_f1,
+            acc=acc,
+            roc_metrics=roc_metrics,
+        )
+        if predict is not True:
+            return eval_results
+        # save predicted IDs, true IDs, and logits for later plotting
+        pred_dict_stem = Path(output_directory) / f"{output_prefix}_pred_dict"
+        pred_dict_output_path = pred_dict_stem.with_suffix(".pkl")
+        with open(pred_dict_output_path, "wb") as f:
+            pickle.dump(
+                {
+                    "pred_ids": y_pred,
+                    "label_ids": y_true,
+                    "predictions": logits_list,
+                },
+                f,
+            )
+        return eval_results
+    def evaluate_saved_model(
+        self,
+        model_directory,
+        id_class_dict_file,
+        test_data_file,
+        output_directory,
+        output_prefix,
+        predict=True,
+    ):
+        """
+        Evaluate a previously saved fine-tuned model on test data.
+        **Parameters**
+        model_directory : Path
+            | Path to directory containing model
+        id_class_dict_file : Path
+            | Path to _id_class_dict.pkl previously prepared by Classifier.prepare_data
+            | (dictionary of format: numerical IDs: class_labels)
+        test_data_file : Path
+            | Path to directory containing test .dataset
+        output_directory : Path
+            | Path to directory where eval data will be saved
+        output_prefix : str
+            | Prefix for output files
+        predict : bool
+            | Whether or not to save eval predictions
+        **Returns**
+        all_metrics : dict
+            | Dictionary with keys "conf_matrix" (DataFrame), "macro_f1", "acc",
+            | and "all_roc_metrics" (None unless there are exactly 2 classes).
+            | Also saved as {output_prefix}_test_metrics_dict.pkl in output_directory.
+        """
+        # load numerical id to class dictionary (id:class)
+        # NOTE: pickle.load executes arbitrary code; only load trusted files
+        with open(id_class_dict_file, "rb") as f:
+            id_class_dict = pickle.load(f)
+        # get number of classes for classifier
+        num_classes = cu.get_num_classes(id_class_dict)
+        # load previously filtered and prepared data
+        test_data = pu.load_and_filter(None, self.nproc, test_data_file)
+        # load previously fine-tuned model
+        if self.classifier == "cell":
+            model_type = "CellClassifier"
+        elif self.classifier == "gene":
+            model_type = "GeneClassifier"
+        model = pu.load_model(model_type, num_classes, model_directory, "eval")
+        # evaluate the model
+        results = self.evaluate_model(
+            model,
+            num_classes,
+            id_class_dict,
+            test_data,
+            predict=predict,
+            output_directory=output_directory,
+            output_prefix=output_prefix,
+        )
+        all_conf_mat_df = pd.DataFrame(
+            results["conf_mat"],
+            columns=id_class_dict.values(),
+            index=id_class_dict.values(),
+        )
+        all_metrics = {
+            "conf_matrix": all_conf_mat_df,
+            "macro_f1": results["macro_f1"],
+            "acc": results["acc"],
+        }
+        all_roc_metrics = None  # roc metrics not reported for multiclass
+        if num_classes == 2:
+            # results is the single evaluation dict returned by evaluate_model
+            # (not a list of per-split results), so read its ROC metrics directly
+            mean_fpr = np.linspace(0, 1, 100)
+            mean_tpr = results["roc_metrics"]["interp_tpr"]
+            all_roc_auc = results["roc_metrics"]["auc"]
+            all_roc_metrics = {
+                "mean_tpr": mean_tpr,
+                "mean_fpr": mean_fpr,
+                "all_roc_auc": all_roc_auc,
+            }
+        all_metrics["all_roc_metrics"] = all_roc_metrics
+        test_metrics_output_path = (
+            Path(output_directory) / f"{output_prefix}_test_metrics_dict"
+        ).with_suffix(".pkl")
+        with open(test_metrics_output_path, "wb") as f:
+            pickle.dump(all_metrics, f)
+        return all_metrics
+    def plot_conf_mat(
+        self,
+        conf_mat_dict,
+        output_directory,
+        output_prefix,
+        custom_class_order=None,
+    ):
+        """
+        Plot confusion matrix results of evaluating the fine-tuned model.
+        One plot is produced per model in conf_mat_dict.
+        **Parameters**
+        conf_mat_dict : dict
+            | Dictionary of model_name : confusion_matrix_DataFrame
+            | (all_metrics["conf_matrix"] from self.validate)
+        output_directory : Path
+            | Path to directory where plots will be saved
+        output_prefix : str
+            | Prefix for output file
+        custom_class_order : None, list
+            | List of classes in custom order for plots.
+            | Same order will be used for all models.
+        """
+        for model_name, conf_mat_df in conf_mat_dict.items():
+            eu.plot_confusion_matrix(
+                conf_mat_df,
+                model_name,
+                output_directory,
+                output_prefix,
+                custom_class_order,
+            )
+    def plot_roc(
+        self,
+        roc_metric_dict,
+        model_style_dict,
+        title,
+        output_directory,
+        output_prefix,
+    ):
+        """
+        Plot ROC curve results of evaluating the fine-tuned model.
+        All arguments are passed through unchanged to eu.plot_ROC.
+        **Parameters**
+        roc_metric_dict : dict
+            | Dictionary of model_name : roc_metrics
+            | (all_metrics["all_roc_metrics"] from self.validate)
+        model_style_dict : dict[dict]
+            | Dictionary of model_name : dictionary of style_attribute : style
+            | where style includes color and linestyle
+            | e.g. {'Model_A': {'color': 'black', 'linestyle': '-'}, 'Model_B': ...}
+        title : str
+            | Title of plot (e.g. 'Dosage-sensitive vs -insensitive factors')
+        output_directory : Path
+            | Path to directory where plots will be saved
+        output_prefix : str
+            | Prefix for output file
+        """
+        eu.plot_ROC(
+            roc_metric_dict, model_style_dict, title, output_directory, output_prefix
+        )
+    def plot_predictions(
+        self,
+        predictions_file,
+        id_class_dict_file,
+        title,
+        output_directory,
+        output_prefix,
+        custom_class_order=None,
+        kwargs_dict=None,
+    ):
+        """
+        Plot prediction results of evaluating the fine-tuned model.
+        **Parameters**
+        predictions_file : path
+            | Path of model predictions output to plot
+            | (saved output from self.validate if predict=True)
+            | (or saved output from self.evaluate_saved_model)
+        id_class_dict_file : Path
+            | Path to _id_class_dict.pkl previously prepared by Classifier.prepare_data
+            | (dictionary of format: numerical IDs: class_labels)
+        title : str
+            | Title for legend containing class labels.
+        output_directory : Path
+            | Path to directory where plots will be saved
+        output_prefix : str
+            | Prefix for output file
+        custom_class_order : None, list
+            | List of classes in custom order for plots.
+            | Same order will be used for all models.
+        kwargs_dict : None, dict
+            | Dictionary of kwargs to pass to plotting function.
+        **Raises**
+        ValueError
+            | If predictions_file holds a dictionary lacking any of the keys
+            | "pred_ids", "label_ids", "predictions"
+        """
+        # NOTE: pickle.load executes arbitrary code; only load trusted files
+        # load predictions
+        with open(predictions_file, "rb") as f:
+            predictions = pickle.load(f)
+        # load numerical id to class dictionary (id:class)
+        with open(id_class_dict_file, "rb") as f:
+            id_class_dict = pickle.load(f)
+        if isinstance(predictions, dict):
+            # format is output from self.evaluate_saved_model
+            missing_keys = [
+                key
+                for key in ["pred_ids", "label_ids", "predictions"]
+                if key not in predictions
+            ]
+            if missing_keys:
+                # fail with a clear message instead of an unbound-variable error below
+                logger.error(
+                    f"Predictions dictionary is missing required key(s): {missing_keys}."
+                )
+                raise ValueError(
+                    f"Predictions dictionary is missing required key(s): {missing_keys}."
+                )
+            predictions_logits = np.array(predictions["predictions"])
+            true_ids = predictions["label_ids"]
+        else:
+            # format is output from self.validate if predict=True
+            predictions_logits = predictions.predictions
+            true_ids = predictions.label_ids
+        num_classes = len(id_class_dict.keys())
+        num_predict_classes = predictions_logits.shape[1]
+        assert num_classes == num_predict_classes
+        classes = id_class_dict.values()
+        true_labels = [id_class_dict[idx] for idx in true_ids]
+        # one column of logits per class, named by class label
+        predictions_df = pd.DataFrame(predictions_logits, columns=classes)
+        if custom_class_order is not None:
+            predictions_df = predictions_df.reindex(columns=custom_class_order)
+        predictions_df["true"] = true_labels
+        # rank of each class label, used to sort rows by their true class
+        class_order = classes if custom_class_order is None else custom_class_order
+        custom_dict = {label: rank for rank, label in enumerate(class_order)}
+        predictions_df = predictions_df.sort_values(
+            by=["true"], key=lambda x: x.map(custom_dict)
+        )
+        eu.plot_predictions(
+            predictions_df, title, output_directory, output_prefix, kwargs_dict
+        )

geneformer/classifier_utils.py ADDED Viewed

	@@ -0,0 +1,406 @@

+import logging
+import random
+from collections import Counter, defaultdict
+import numpy as np
+import pandas as pd
+from scipy.stats import chisquare, ranksums
+from sklearn.metrics import accuracy_score, f1_score
+from . import perturber_utils as pu
+logger = logging.getLogger(__name__)
+def downsample_and_shuffle(data, max_ncells, max_ncells_per_class, cell_state_dict):
+    """
+    Shuffle data (seed 42), then optionally cap the total number of cells at
+    max_ncells and the number of cells per class at max_ncells_per_class.
+    cell_state_dict["state_key"] names the class column and is only read when
+    max_ncells_per_class is not None.
+    """
+    shuffled = data.shuffle(seed=42)
+    # if max number of cells is defined, keep only the first max_ncells shuffled cells
+    if (max_ncells is not None) and (len(shuffled) > max_ncells):
+        shuffled = shuffled.select(list(range(max_ncells)))
+    if max_ncells_per_class is None:
+        return shuffled
+    per_cell_classes = shuffled[cell_state_dict["state_key"]]
+    # seed the global RNG so the per-class subsample is reproducible
+    random.seed(42)
+    kept_indices = subsample_by_class(per_cell_classes, max_ncells_per_class)
+    return shuffled.select(kept_indices)
+# subsample labels to maximum number N per class and return indices
+def subsample_by_class(labels, N):
+    """
+    Return indices into labels keeping at most N randomly chosen entries per label.
+    Labels are processed in order of first appearance; classes with N or fewer
+    members are kept whole, in their original order.
+    """
+    positions_by_label = defaultdict(list)
+    # group positions by the label found there
+    for position, label in enumerate(labels):
+        positions_by_label[label].append(position)
+    kept = []
+    for positions in positions_by_label.values():
+        if len(positions) <= N:
+            kept += positions
+        else:
+            # over-represented class: draw N positions without replacement
+            kept += random.sample(positions, N)
+    return kept
+def rename_cols(data, state_key):
+    """Return data with the state_key column renamed to "label"."""
+    return data.rename_column(state_key, "label")
+def validate_and_clean_cols(train_data, eval_data, classifier):
+    """
+    Check that train_data (and eval_data, if given) contain the label column
+    expected for the classifier type ("label" for cell, "labels" for gene) and
+    drop every column other than the label column, "input_ids", and "length".
+    Returns the cleaned (train_data, eval_data); eval_data stays None if None.
+    """
+    # label column name depends on the classifier type
+    if classifier == "cell":
+        label_col = "label"
+    elif classifier == "gene":
+        label_col = "labels"
+    cols_to_keep = [label_col, "input_ids", "length"]
+    if label_col not in train_data.column_names:
+        logger.error(f"train_data must contain column {label_col} with class labels.")
+        raise
+    train_data = remove_cols(train_data, cols_to_keep)
+    # evaluation data is optional
+    if eval_data is None:
+        return train_data, eval_data
+    if label_col not in eval_data.column_names:
+        logger.error(
+            f"eval_data must contain column {label_col} with class labels."
+        )
+        raise
+    return train_data, remove_cols(eval_data, cols_to_keep)
+def remove_cols(data, cols_to_keep):
+    """Return data with every column not listed in cols_to_keep removed."""
+    cols_to_drop = [col for col in data.features.keys() if col not in cols_to_keep]
+    return data.remove_columns(cols_to_drop)
+def remove_rare(data, rare_threshold, label, nproc):
+    """
+    Filter data to the labels whose share of all cells exceeds rare_threshold.
+    data is returned unchanged when rare_threshold is not greater than 0.
+    label is the name of the column holding the labels; nproc is passed to
+    pu.filter_by_dict (presumably the number of worker processes).
+    """
+    if rare_threshold > 0:
+        total_cells = len(data)
+        label_counter = Counter(data[label])
+        # iterate over (label, count) pairs; iterating the Counter itself
+        # would yield only the labels
+        nonrare_label_dict = {
+            label: [
+                k
+                for k, v in label_counter.items()
+                if (v / total_cells) > rare_threshold
+            ]
+        }
+        data = pu.filter_by_dict(data, nonrare_label_dict, nproc)
+    return data
+def label_classes(classifier, data, gene_class_dict, nproc):
+    """
+    Convert class labels to numerical IDs.
+    For the cell classifier, IDs are assigned to the distinct values of column
+    "label". For the gene classifier, data is first filtered to examples whose
+    "input_ids" contain at least one gene listed in gene_class_dict, IDs are
+    assigned to the keys of gene_class_dict, and per-token labels are written to
+    column "labels" via label_gene_classes.
+    Returns (data, id_class_dict) where id_class_dict maps numerical ID to class.
+    """
+    if classifier == "cell":
+        label_set = set(data["label"])
+    elif classifier == "gene":
+        # build the set of target genes once rather than once per example
+        target_genes = set(pu.flatten_list(gene_class_dict.values()))
+        # remove cells without any of the target genes
+        def if_contains_label(example):
+            return not target_genes.isdisjoint(example["input_ids"])
+        data = data.filter(if_contains_label, num_proc=nproc)
+        label_set = gene_class_dict.keys()
+        if len(data) == 0:
+            logger.error(
+                "No cells remain after filtering for target genes. Check target gene list."
+            )
+            raise
+    class_id_dict = {label: class_id for class_id, label in enumerate(label_set)}
+    id_class_dict = {v: k for k, v in class_id_dict.items()}
+    def classes_to_ids(example):
+        if classifier == "cell":
+            example["label"] = class_id_dict[example["label"]]
+        elif classifier == "gene":
+            example["labels"] = label_gene_classes(
+                example, class_id_dict, gene_class_dict
+            )
+        return example
+    data = data.map(classes_to_ids, num_proc=nproc)
+    return data, id_class_dict
+def label_gene_classes(example, class_id_dict, gene_class_dict):
+    """
+    Return one label per token in example["input_ids"]: each token is looked up
+    in gene_class_dict and the result is looked up in class_id_dict; -100 is
+    used whenever either lookup misses.
+    """
+    token_labels = []
+    for token_id in example["input_ids"]:
+        gene_class = gene_class_dict.get(token_id, -100)
+        token_labels.append(class_id_dict.get(gene_class, -100))
+    return token_labels
+def prep_gene_classifier_split(
+    data, targets, labels, train_index, eval_index, max_ncells, iteration_num, num_proc
+):
+    """
+    Build the training and evaluation datasets for one (cross-)validation split
+    of a gene classifier.
+    targets and labels are parallel sequences (target gene, class label);
+    train_index and eval_index select the genes for each side of the split.
+    Each dataset is filtered to examples whose "input_ids" contain at least one
+    of its genes, shuffled and subsampled to max_ncells, and given a "labels"
+    column holding the gene's label per token (-100 for all other tokens).
+    iteration_num is only used in log messages.
+    Returns (train_data, eval_data).
+    """
+    # generate cross-validation splits
+    targets = np.array(targets)
+    labels = np.array(labels)
+    targets_train, targets_eval = targets[train_index], targets[eval_index]
+    labels_train, labels_eval = labels[train_index], labels[eval_index]
+    label_dict_train = dict(zip(targets_train, labels_train))
+    label_dict_eval = dict(zip(targets_eval, labels_eval))
+    # build the gene sets once rather than once per example
+    train_target_set = set(targets_train)
+    eval_target_set = set(targets_eval)
+    # function to filter by whether contains train or eval labels
+    def if_contains_train_label(example):
+        return not train_target_set.isdisjoint(example["input_ids"])
+    def if_contains_eval_label(example):
+        return not eval_target_set.isdisjoint(example["input_ids"])
+    # filter dataset for examples containing classes for this split
+    logger.info(f"Filtering training data for genes in split {iteration_num}")
+    train_data = data.filter(if_contains_train_label, num_proc=num_proc)
+    logger.info(
+        f"Filtered {round((1-len(train_data)/len(data))*100)}%; {len(train_data)} remain\n"
+    )
+    logger.info(f"Filtering evaluation data for genes in split {iteration_num}")
+    eval_data = data.filter(if_contains_eval_label, num_proc=num_proc)
+    logger.info(
+        f"Filtered {round((1-len(eval_data)/len(data))*100)}%; {len(eval_data)} remain\n"
+    )
+    # subsample to max_ncells
+    train_data = downsample_and_shuffle(train_data, max_ncells, None, None)
+    eval_data = downsample_and_shuffle(eval_data, max_ncells, None, None)
+    # relabel genes for this split
+    def train_classes_to_ids(example):
+        example["labels"] = [
+            label_dict_train.get(token_id, -100) for token_id in example["input_ids"]
+        ]
+        return example
+    def eval_classes_to_ids(example):
+        example["labels"] = [
+            label_dict_eval.get(token_id, -100) for token_id in example["input_ids"]
+        ]
+        return example
+    train_data = train_data.map(train_classes_to_ids, num_proc=num_proc)
+    eval_data = eval_data.map(eval_classes_to_ids, num_proc=num_proc)
+    return train_data, eval_data
+def prep_gene_classifier_all_data(data, targets, labels, max_ncells, num_proc):
+    """
+    Build a gene classifier training dataset from all target genes (no split).
+    targets and labels are parallel sequences (target gene, class label).
+    data is filtered to examples whose "input_ids" contain at least one target
+    gene, shuffled and subsampled to max_ncells, and given a "labels" column
+    holding the gene's label per token (-100 for all other tokens).
+    Returns the prepared training dataset.
+    """
+    targets = np.array(targets)
+    labels = np.array(labels)
+    label_dict_train = dict(zip(targets, labels))
+    # build the gene set once rather than once per example
+    target_set = set(targets)
+    # function to filter by whether contains train labels
+    def if_contains_train_label(example):
+        return not target_set.isdisjoint(example["input_ids"])
+    # filter dataset for examples containing genes to classify
+    logger.info("Filtering training data for genes to classify.")
+    train_data = data.filter(if_contains_train_label, num_proc=num_proc)
+    logger.info(
+        f"Filtered {round((1-len(train_data)/len(data))*100)}%; {len(train_data)} remain\n"
+    )
+    # subsample to max_ncells
+    train_data = downsample_and_shuffle(train_data, max_ncells, None, None)
+    # relabel genes with their class labels
+    def train_classes_to_ids(example):
+        example["labels"] = [
+            label_dict_train.get(token_id, -100) for token_id in example["input_ids"]
+        ]
+        return example
+    train_data = train_data.map(train_classes_to_ids, num_proc=num_proc)
+    return train_data
+def balance_attr_splits(
+    data,
+    attr_to_split,
+    attr_to_balance,
+    eval_size,
+    max_trials,
+    pval_threshold,
+    state_key,
+    nproc,
+):
+    metadata_df = pd.DataFrame({"split_attr_ids": data[attr_to_split]})
+    for attr in attr_to_balance:
+        if attr == state_key:
+            metadata_df[attr] = data["label"]
+        else:
+            metadata_df[attr] = data[attr]
+    metadata_df = metadata_df.drop_duplicates()
+    split_attr_ids = list(metadata_df["split_attr_ids"])
+    assert len(split_attr_ids) == len(set(split_attr_ids))
+    eval_num = round(len(split_attr_ids) * eval_size)
+    colnames = (
+        ["trial_num", "train_ids", "eval_ids"]
+        + pu.flatten_list(
+            [
+                [
+                    f"{attr}_train_mean_or_counts",
+                    f"{attr}_eval_mean_or_counts",
+                    f"{attr}_pval",
+                ]
+                for attr in attr_to_balance
+            ]
+        )
+        + ["mean_pval"]
+    )
+    balance_df = pd.DataFrame(columns=colnames)
+    data_dict = dict()
+    trial_num = 1
+    for i in range(max_trials):
+        if not all(
+            count > 1 for count in list(Counter(metadata_df[state_key]).values())
+        ):
+            logger.error(
+                f"Cannot balance by {attr_to_split} while retaining at least 1 occurrence of each {state_key} class in both data splits. "
+            )
+            raise
+        eval_base = []
+        for state in set(metadata_df[state_key]):
+            eval_base += list(
+                metadata_df.loc[
+                    metadata_df[state_key][metadata_df[state_key].eq(state)]
+                    .sample(1, random_state=i)
+                    .index
+                ]["split_attr_ids"]
+            )
+        non_eval_base = [idx for idx in split_attr_ids if idx not in eval_base]
+        random.seed(i)
+        eval_ids = random.sample(non_eval_base, eval_num - len(eval_base)) + eval_base
+        train_ids = [idx for idx in split_attr_ids if idx not in eval_ids]
+        df_vals = [trial_num, train_ids, eval_ids]
+        pvals = []
+        for attr in attr_to_balance:
+            train_attr = list(
+                metadata_df[metadata_df["split_attr_ids"].isin(train_ids)][attr]
+            )
+            eval_attr = list(
+                metadata_df[metadata_df["split_attr_ids"].isin(eval_ids)][attr]
+            )
+            if attr == state_key:
+                # ensure IDs are interpreted as categorical
+                train_attr = [str(item) for item in train_attr]
+                eval_attr = [str(item) for item in eval_attr]
+            if all(isinstance(item, (int, float)) for item in train_attr + eval_attr):
+                train_attr_mean = np.nanmean(train_attr)
+                eval_attr_mean = np.nanmean(eval_attr)
+                pval = ranksums(train_attr, eval_attr, nan_policy="omit").pvalue
+                df_vals += [train_attr_mean, eval_attr_mean, pval]
+            elif all(isinstance(item, (str)) for item in train_attr + eval_attr):
+                obs_counts = Counter(train_attr)
+                exp_counts = Counter(eval_attr)
+                all_categ = set(obs_counts.keys()).union(set(exp_counts.keys()))
+                obs = [obs_counts[cat] for cat in all_categ]
+                exp = [
+                    exp_counts[cat] * sum(obs) / sum(exp_counts.values())
+                    for cat in all_categ
+                ]
+                chisquare(f_obs=obs, f_exp=exp).pvalue
+                train_attr_counts = str(obs_counts).strip("Counter(").strip(")")
+                eval_attr_counts = str(exp_counts).strip("Counter(").strip(")")
+                df_vals += [train_attr_counts, eval_attr_counts, pval]
+            else:
+                logger.error(
+                    f"Inconsistent data types in attribute {attr}. "
+                    "Cannot infer if continuous or categorical. "
+                    "Must be all numeric (continuous) or all strings (categorical) to balance."
+                )
+                raise
+            pvals += [pval]
+        df_vals += [np.nanmean(pvals)]
+        balance_df_i = pd.DataFrame(df_vals, index=colnames).T
+        balance_df = pd.concat([balance_df, balance_df_i], ignore_index=True)
+        valid_pvals = [
+            pval_i
+            for pval_i in pvals
+            if isinstance(pval_i, (int, float)) and not np.isnan(pval_i)
+        ]
+        if all(i >= pval_threshold for i in valid_pvals):
+            data_dict["train"] = pu.filter_by_dict(
+                data, {attr_to_split: balance_df_i["train_ids"][0]}, nproc
+            )
+            data_dict["test"] = pu.filter_by_dict(
+                data, {attr_to_split: balance_df_i["eval_ids"][0]}, nproc
+            )
+            return data_dict, balance_df
+        trial_num = trial_num + 1
+    balance_max_df = balance_df.iloc[balance_df["mean_pval"].idxmax(), :]
+    data_dict["train"] = pu.filter_by_dict(
+        data, {attr_to_split: balance_df_i["train_ids"][0]}, nproc
+    )
+    data_dict["test"] = pu.filter_by_dict(
+        data, {attr_to_split: balance_df_i["eval_ids"][0]}, nproc
+    )
+    logger.warning(
+        f"No splits found without significant difference in attr_to_balance among {max_trials} trials. "
+        f"Selecting optimal split (trial #{balance_max_df['trial_num']}) from completed trials."
+    )
+    return data_dict, balance_df
+def get_num_classes(id_class_dict):
+    return len(set(id_class_dict.values()))
+def compute_metrics(pred):
+    labels = pred.label_ids
+    preds = pred.predictions.argmax(-1)
+    # calculate accuracy and macro f1 using sklearn's function
+    acc = accuracy_score(labels, preds)
+    macro_f1 = f1_score(labels, preds, average="macro")
+    return {"accuracy": acc, "macro_f1": macro_f1}
+def get_default_train_args(model, classifier, data, output_dir):
+    num_layers = pu.quant_layers(model)
+    freeze_layers = 0
+    batch_size = 12
+    if classifier == "cell":
+        epochs = 10
+        evaluation_strategy = "epoch"
+        load_best_model_at_end = True
+    else:
+        epochs = 1
+        evaluation_strategy = "no"
+        load_best_model_at_end = False
+    if num_layers == 6:
+        default_training_args = {
+            "learning_rate": 5e-5,
+            "lr_scheduler_type": "linear",
+            "warmup_steps": 500,
+            "per_device_train_batch_size": batch_size,
+            "per_device_eval_batch_size": batch_size,
+        }
+    training_args = {
+        "num_train_epochs": epochs,
+        "do_train": True,
+        "do_eval": True,
+        "evaluation_strategy": evaluation_strategy,
+        "logging_steps": np.floor(len(data) / batch_size / 8),  # 8 evals per epoch
+        "save_strategy": "epoch",
+        "group_by_length": False,
+        "length_column_name": "length",
+        "disable_tqdm": False,
+        "weight_decay": 0.001,
+        "load_best_model_at_end": load_best_model_at_end,
+    }
+    training_args.update(default_training_args)
+    return training_args, freeze_layers

geneformer/emb_extractor.py CHANGED Viewed

@@ -17,7 +17,6 @@ from pathlib import Path
 import anndata
 import matplotlib.pyplot as plt
-import numpy as np
 import pandas as pd
 import scanpy as sc
 import seaborn as sns
@@ -303,13 +302,6 @@ def make_colorbar(embs_df, label):
     cell_type_colors = gen_heatmap_class_colors(labels, embs_df)
     label_colors = pd.DataFrame(cell_type_colors, columns=[label])
-    for i, row in label_colors.iterrows():
-        colors = row[0]
-        if len(colors) != 3 or any(np.isnan(colors)):
-            print(i, colors)
-    label_colors.isna().sum()
     # create dictionary for colors and classes
     label_color_dict = gen_heatmap_class_dict(labels, label_colors[label])
     return label_colors, label_color_dict
@@ -565,7 +557,9 @@ class EmbExtractor:
                 filtered_input_data, cell_state, self.nproc
             )
         downsampled_data = pu.downsample_and_sort(filtered_input_data, self.max_ncells)
-        model = pu.load_model(self.model_type, self.num_classes, model_directory, mode = "eval")
         layer_to_quant = pu.quant_layers(model) + self.emb_layer
         embs = get_embs(
             model,

 import anndata
 import matplotlib.pyplot as plt
 import pandas as pd
 import scanpy as sc
 import seaborn as sns
     cell_type_colors = gen_heatmap_class_colors(labels, embs_df)
     label_colors = pd.DataFrame(cell_type_colors, columns=[label])
     # create dictionary for colors and classes
     label_color_dict = gen_heatmap_class_dict(labels, label_colors[label])
     return label_colors, label_color_dict
                 filtered_input_data, cell_state, self.nproc
             )
         downsampled_data = pu.downsample_and_sort(filtered_input_data, self.max_ncells)
+        model = pu.load_model(
+            self.model_type, self.num_classes, model_directory, mode="eval"
+        )
         layer_to_quant = pu.quant_layers(model) + self.emb_layer
         embs = get_embs(
             model,

geneformer/evaluation_utils.py ADDED Viewed

	@@ -0,0 +1,287 @@

+import logging
+import math
+import pickle
+from pathlib import Path
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import torch
+from datasets.utils.logging import disable_progress_bar, enable_progress_bar
+from sklearn import preprocessing
+from sklearn.metrics import (
+    ConfusionMatrixDisplay,
+    accuracy_score,
+    auc,
+    confusion_matrix,
+    f1_score,
+    roc_curve,
+)
+from tqdm.auto import trange
+from .emb_extractor import make_colorbar
+from .tokenizer import TOKEN_DICTIONARY_FILE
+logger = logging.getLogger(__name__)
+# load token dictionary (Ensembl IDs:token)
+# NOTE: this runs at import time, so importing this module reads
+# TOKEN_DICTIONARY_FILE from disk; preprocess_classifier_batch looks up
+# the "<pad>" token in the resulting gene_token_dict.
+with open(TOKEN_DICTIONARY_FILE, "rb") as f:
+    gene_token_dict = pickle.load(f)
+def preprocess_classifier_batch(cell_batch, max_len, label_name):
+    if max_len is None:
+        max_len = max([len(i) for i in cell_batch["input_ids"]])
+    def pad_label_example(example):
+        example[label_name] = np.pad(
+            example[label_name],
+            (0, max_len - len(example["input_ids"])),
+            mode="constant",
+            constant_values=-100,
+        )
+        example["input_ids"] = np.pad(
+            example["input_ids"],
+            (0, max_len - len(example["input_ids"])),
+            mode="constant",
+            constant_values=gene_token_dict.get("<pad>"),
+        )
+        example["attention_mask"] = (
+            example["input_ids"] != gene_token_dict.get("<pad>")
+        ).astype(int)
+        return example
+    padded_batch = cell_batch.map(pad_label_example)
+    return padded_batch
+# Function to find the largest number smaller
+# than or equal to N that is divisible by k
+def find_largest_div(N, K):
+    rem = N % K
+    if rem == 0:
+        return N
+    else:
+        return N - rem
+def vote(logit_list):
+    m = max(logit_list)
+    logit_list.index(m)
+    indices = [i for i, x in enumerate(logit_list) if x == m]
+    if len(indices) > 1:
+        return "tie"
+    else:
+        return indices[0]
+def py_softmax(vector):
+    e = np.exp(vector)
+    return e / e.sum()
+def classifier_predict(model, classifier_type, evalset, forward_batch_size):
+    if classifier_type == "gene":
+        label_name = "labels"
+    elif classifier_type == "cell":
+        label_name = "label"
+    predict_logits = []
+    predict_labels = []
+    model.eval()
+    # ensure there is at least 2 examples in each batch to avoid incorrect tensor dims
+    evalset_len = len(evalset)
+    max_divisible = find_largest_div(evalset_len, forward_batch_size)
+    if len(evalset) - max_divisible == 1:
+        evalset_len = max_divisible
+    max_evalset_len = max(evalset.select([i for i in range(evalset_len)])["length"])
+    disable_progress_bar()  # disable progress bar for preprocess_classifier_batch mapping
+    for i in trange(0, evalset_len, forward_batch_size):
+        max_range = min(i + forward_batch_size, evalset_len)
+        batch_evalset = evalset.select([i for i in range(i, max_range)])
+        padded_batch = preprocess_classifier_batch(
+            batch_evalset, max_evalset_len, label_name
+        )
+        padded_batch.set_format(type="torch")
+        input_data_batch = padded_batch["input_ids"]
+        attn_msk_batch = padded_batch["attention_mask"]
+        label_batch = padded_batch[label_name]
+        with torch.no_grad():
+            outputs = model(
+                input_ids=input_data_batch.to("cuda"),
+                attention_mask=attn_msk_batch.to("cuda"),
+                labels=label_batch.to("cuda"),
+            )
+            predict_logits += [torch.squeeze(outputs.logits.to("cpu"))]
+            predict_labels += [torch.squeeze(label_batch.to("cpu"))]
+    enable_progress_bar()
+    logits_by_cell = torch.cat(predict_logits)
+    last_dim = len(logits_by_cell.shape) - 1
+    all_logits = logits_by_cell.reshape(-1, logits_by_cell.shape[last_dim])
+    labels_by_cell = torch.cat(predict_labels)
+    all_labels = torch.flatten(labels_by_cell)
+    logit_label_paired = [
+        item
+        for item in list(zip(all_logits.tolist(), all_labels.tolist()))
+        if item[1] != -100
+    ]
+    y_pred = [vote(item[0]) for item in logit_label_paired]
+    y_true = [item[1] for item in logit_label_paired]
+    logits_list = [item[0] for item in logit_label_paired]
+    return y_pred, y_true, logits_list
+def get_metrics(y_pred, y_true, logits_list, num_classes, labels):
+    conf_mat = confusion_matrix(y_true, y_pred, labels=list(labels))
+    macro_f1 = f1_score(y_true, y_pred, average="macro")
+    acc = accuracy_score(y_true, y_pred)
+    roc_metrics = None  # roc metrics not reported for multiclass
+    if num_classes == 2:
+        y_score = [py_softmax(item)[1] for item in logits_list]
+        fpr, tpr, _ = roc_curve(y_true, y_score)
+        mean_fpr = np.linspace(0, 1, 100)
+        interp_tpr = np.interp(mean_fpr, fpr, tpr)
+        interp_tpr[0] = 0.0
+        tpr_wt = len(tpr)
+        roc_auc = auc(fpr, tpr)
+        roc_metrics = {
+            "fpr": fpr,
+            "tpr": tpr,
+            "interp_tpr": interp_tpr,
+            "auc": roc_auc,
+            "tpr_wt": tpr_wt,
+        }
+    return conf_mat, macro_f1, acc, roc_metrics
+# get cross-validated mean and sd metrics
+def get_cross_valid_roc_metrics(all_tpr, all_roc_auc, all_tpr_wt):
+    wts = [count / sum(all_tpr_wt) for count in all_tpr_wt]
+    all_weighted_tpr = [a * b for a, b in zip(all_tpr, wts)]
+    mean_tpr = np.sum(all_weighted_tpr, axis=0)
+    mean_tpr[-1] = 1.0
+    all_weighted_roc_auc = [a * b for a, b in zip(all_roc_auc, wts)]
+    roc_auc = np.sum(all_weighted_roc_auc)
+    roc_auc_sd = math.sqrt(np.average((all_roc_auc - roc_auc) ** 2, weights=wts))
+    return mean_tpr, roc_auc, roc_auc_sd
+# plot ROC curve
+def plot_ROC(roc_metric_dict, model_style_dict, title, output_dir, output_prefix):
+    fig = plt.figure()
+    fig.set_size_inches(10, 8)
+    sns.set(font_scale=2)
+    sns.set_style("white")
+    lw = 3
+    for model_name in roc_metric_dict.keys():
+        mean_fpr = roc_metric_dict[model_name]["mean_fpr"]
+        mean_tpr = roc_metric_dict[model_name]["mean_tpr"]
+        roc_auc = roc_metric_dict[model_name]["roc_auc"]
+        roc_auc_sd = roc_metric_dict[model_name]["roc_auc_sd"]
+        color = model_style_dict[model_name]["color"]
+        linestyle = model_style_dict[model_name]["linestyle"]
+        if len(roc_metric_dict[model_name]["all_roc_auc"]) > 1:
+            label = f"{model_name} (AUC {roc_auc:0.2f} $\pm$ {roc_auc_sd:0.2f})"
+        else:
+            label = f"{model_name} (AUC {roc_auc:0.2f})"
+        plt.plot(
+            mean_fpr, mean_tpr, color=color, linestyle=linestyle, lw=lw, label=label
+        )
+    plt.plot([0, 1], [0, 1], color="black", lw=lw, linestyle="--")
+    plt.xlim([0.0, 1.0])
+    plt.ylim([0.0, 1.05])
+    plt.xlabel("False Positive Rate")
+    plt.ylabel("True Positive Rate")
+    plt.title(title)
+    plt.legend(loc="lower right")
+    plt.show()
+    output_file = (Path(output_dir) / f"{output_prefix}_roc").with_suffix(".pdf")
+    plt.savefig(output_file, bbox_inches="tight")
+# plot confusion matrix
+def plot_confusion_matrix(
+    conf_mat_df, title, output_dir, output_prefix, custom_class_order
+):
+    fig = plt.figure()
+    fig.set_size_inches(10, 10)
+    sns.set(font_scale=1)
+    sns.set_style("whitegrid", {"axes.grid": False})
+    if custom_class_order is not None:
+        conf_mat_df = conf_mat_df.reindex(
+            index=custom_class_order, columns=custom_class_order
+        )
+    display_labels = generate_display_labels(conf_mat_df)
+    conf_mat = preprocessing.normalize(conf_mat_df.to_numpy(), norm="l1")
+    display = ConfusionMatrixDisplay(
+        confusion_matrix=conf_mat, display_labels=display_labels
+    )
+    display.plot(cmap="Blues", values_format=".2g")
+    plt.title(title)
+    plt.show()
+    output_file = (Path(output_dir) / f"{output_prefix}_conf_mat").with_suffix(".pdf")
+    display.figure_.savefig(output_file, bbox_inches="tight")
+def generate_display_labels(conf_mat_df):
+    display_labels = []
+    i = 0
+    for label in conf_mat_df.index:
+        display_labels += [f"{label}\nn={conf_mat_df.iloc[i,:].sum():.0f}"]
+        i = i + 1
+    return display_labels
+def plot_predictions(predictions_df, title, output_dir, output_prefix, kwargs_dict):
+    sns.set(font_scale=2)
+    plt.figure(figsize=(10, 10), dpi=150)
+    label_colors, label_color_dict = make_colorbar(predictions_df, "true")
+    predictions_df = predictions_df.drop(columns=["true"])
+    predict_colors_list = [label_color_dict[label] for label in predictions_df.columns]
+    predict_label_list = [label for label in predictions_df.columns]
+    predict_colors = pd.DataFrame(
+        pd.Series(predict_colors_list, index=predict_label_list), columns=["predicted"]
+    )
+    default_kwargs_dict = {
+        "row_cluster": False,
+        "col_cluster": False,
+        "row_colors": label_colors,
+        "col_colors": predict_colors,
+        "linewidths": 0,
+        "xticklabels": False,
+        "yticklabels": False,
+        "center": 0,
+        "cmap": "vlag",
+    }
+    if kwargs_dict is not None:
+        default_kwargs_dict.update(kwargs_dict)
+    g = sns.clustermap(predictions_df, **default_kwargs_dict)
+    plt.setp(g.ax_row_colors.get_xmajorticklabels(), rotation=45, ha="right")
+    for label_color in list(label_color_dict.keys()):
+        g.ax_col_dendrogram.bar(
+            0, 0, color=label_color_dict[label_color], label=label_color, linewidth=0
+        )
+        g.ax_col_dendrogram.legend(
+            title=f"{title}",
+            loc="lower center",
+            ncol=4,
+            bbox_to_anchor=(0.5, 1),
+            facecolor="white",
+        )
+    output_file = (Path(output_dir) / f"{output_prefix}_pred").with_suffix(".pdf")
+    plt.savefig(output_file, bbox_inches="tight")

geneformer/in_silico_perturber_stats.py CHANGED Viewed

@@ -801,6 +801,12 @@ class InSilicoPerturberStats:
                         logger.error("All states must be unique.")
                         raise
             else:
                 logger.error(
                     "cell_states_to_model must only have the following four keys: "

                         logger.error("All states must be unique.")
                         raise
+            elif set(self.cell_states_to_model.keys()) == {
+                "state_key",
+                "start_state",
+                "goal_state",
+            }:
+                self.cell_states_to_model["alt_states"] = []
             else:
                 logger.error(
                     "cell_states_to_model must only have the following four keys: "

geneformer/tokenizer.py CHANGED Viewed

@@ -43,12 +43,13 @@ from pathlib import Path
 from typing import Literal
 import anndata as ad
-import loompy as lp
 import numpy as np
 import scipy.sparse as sp
 from datasets import Dataset
-warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")
 logger = logging.getLogger(__name__)
 GENE_MEDIAN_FILE = Path(__file__).parent / "gene_median_dictionary.pkl"
@@ -81,7 +82,7 @@ class TranscriptomeTokenizer:
         custom_attr_name_dict=None,
         nproc=1,
         chunk_size=512,
-        input_size=2048,
         special_token=False,
         gene_median_file=GENE_MEDIAN_FILE,
         token_dictionary_file=TOKEN_DICTIONARY_FILE,
@@ -95,10 +96,10 @@ class TranscriptomeTokenizer:
             | Values are the names of the attributes in the dataset.
         nproc : int
             | Number of processes to use for dataset mapping.
-        chunk_size: int = 512
             | Chunk size for anndata tokenizer.
-        input_size: int = 2048
-            | Input size for tokenization
         special_token: bool = False
             | Option to add CLS and SEP tokens
         gene_median_file : Path
@@ -117,7 +118,7 @@ class TranscriptomeTokenizer:
         self.chunk_size = chunk_size
         # input size for tokenization
-        self.input_size = input_size
         # add CLS and SEP tokens
         self.special_token = special_token
@@ -163,7 +164,9 @@ class TranscriptomeTokenizer:
             Path(data_directory), file_format
         )
         tokenized_dataset = self.create_dataset(
-            tokenized_cells, cell_metadata, use_generator=use_generator
         )
         output_path = (Path(output_directory) / output_prefix).with_suffix(".dataset")
@@ -332,7 +335,7 @@ class TranscriptomeTokenizer:
                         file_cell_metadata[k] += subview.ca[k].tolist()
                 else:
                     file_cell_metadata = None
         return tokenized_cells, file_cell_metadata
     def create_dataset(
@@ -367,12 +370,20 @@ class TranscriptomeTokenizer:
             # Truncate/Crop input_ids to input size
             if self.special_token:
-                example["input_ids"] = example["input_ids"][0:self.input_size-2] # truncate to leave space for CLS and SEP token
-                example["input_ids"] = np.insert(example["input_ids"], 0, self.gene_token_dict.get("<cls>"))
-                example["input_ids"] = np.insert(example["input_ids"], len(example["input_ids"]), self.gene_token_dict.get("<sep>"))
             else:
                 # Truncate/Crop input_ids to input size
-                example["input_ids"] = example["input_ids"][0:self.input_size]
             example["length"] = len(example["input_ids"])
             return example
@@ -380,4 +391,4 @@ class TranscriptomeTokenizer:
         output_dataset_truncated = output_dataset.map(
             format_cell_features, num_proc=self.nproc
         )
-        return output_dataset_truncated

 from typing import Literal
 import anndata as ad
 import numpy as np
 import scipy.sparse as sp
 from datasets import Dataset
+warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")  # noqa
+import loompy as lp  # noqa
 logger = logging.getLogger(__name__)
 GENE_MEDIAN_FILE = Path(__file__).parent / "gene_median_dictionary.pkl"
         custom_attr_name_dict=None,
         nproc=1,
         chunk_size=512,
+        model_input_size=2048,
         special_token=False,
         gene_median_file=GENE_MEDIAN_FILE,
         token_dictionary_file=TOKEN_DICTIONARY_FILE,
             | Values are the names of the attributes in the dataset.
         nproc : int
             | Number of processes to use for dataset mapping.
+        chunk_size : int = 512
             | Chunk size for anndata tokenizer.
+        model_input_size: int = 2048
+            | Max input size of model to truncate input to.
         special_token: bool = False
             | Option to add CLS and SEP tokens
         gene_median_file : Path
         self.chunk_size = chunk_size
         # input size for tokenization
+        self.model_input_size = model_input_size
         # add CLS and SEP tokens
         self.special_token = special_token
             Path(data_directory), file_format
         )
         tokenized_dataset = self.create_dataset(
+            tokenized_cells,
+            cell_metadata,
+            use_generator=use_generator,
         )
         output_path = (Path(output_directory) / output_prefix).with_suffix(".dataset")
                         file_cell_metadata[k] += subview.ca[k].tolist()
                 else:
                     file_cell_metadata = None
         return tokenized_cells, file_cell_metadata
     def create_dataset(
             # Truncate/Crop input_ids to input size
             if self.special_token:
+                example["input_ids"] = example["input_ids"][
+                    0 : self.model_input_size - 2
+                ]  # truncate to leave space for CLS and SEP token
+                example["input_ids"] = np.insert(
+                    example["input_ids"], 0, self.gene_token_dict.get("<cls>")
+                )
+                example["input_ids"] = np.insert(
+                    example["input_ids"],
+                    len(example["input_ids"]),
+                    self.gene_token_dict.get("<sep>"),
+                )
             else:
                 # Truncate/Crop input_ids to input size
+                example["input_ids"] = example["input_ids"][0 : self._model_input_size]
             example["length"] = len(example["input_ids"])
             return example
         output_dataset_truncated = output_dataset.map(
             format_cell_features, num_proc=self.nproc
         )
+        return output_dataset_truncated