from pathlib import Path
import os

import torch

# Cell classifier
def finetune_cells(token_set = Path('geneformer/token_dictionary.pkl'),
                   median_set = Path('geneformer/gene_median_dictionary.pkl'),
                   pretrained_model = ".",
                   dataset = 'Genecorpus-30M/example_input_files/cell_classification/cell_type_annotation/cell_type_train_data.dataset/',
                   dataset_split = None,
                   filter_cells = 0.005,
                   epochs = 1,
                   cpu_cores = os.cpu_count(),
                   geneformer_batch_size = 12,
                   optimizer = 'adamw',
                   max_lr = 5e-5,
                   num_gpus = torch.cuda.device_count(),
                   max_input_size = 2 ** 11,
                   lr_schedule_fn = "linear",
                   warmup_steps = 500,
                   freeze_layers = 0,
                   emb_extract = False,
                   max_cells = 1000,
                   emb_layer = 0,
                   emb_filter = None,
                   emb_dir = 'embeddings',
                   overwrite = True,
                   label = "cell_type",
                   data_filter = None,
                   forward_batch = 200,
                   model_location = None,
                   skip_training = False,
                   sample_data = 1,
                   inference = False,
                   optimize_hyperparameters = False,
                   output_dir = None):

    '''
    Primary Parameters
    -------------------
    dataset: path
        Path to the dataset used for fine-tuning/testing

    model_location: path
        Path to location of existing model to use for inference and embedding extraction

    pretrained_model: path
        Path to pretrained GeneFormer 30M model before fine-tuning

    inference: bool
        Whether to perform inference (in which case the function returns the list of similarities). Defaults to False

    skip_training: bool
        Whether to skip training the model. Defaults to False

    emb_extract: bool
        Whether to extract embeddings and calculate similarities. Defaults to False

    optimize_hyperparameters: bool
        Whether to optimize model hyperparameters. Defaults to False

    label: str
        The feature in the formatted dataset that contains the true class labels. Defaults to "cell_type"

    Customization Parameters
    -------------------

    dataset_split: str
        How the dataset should be partitioned (if at all), and what ID should be used for partitioning

    data_filter: list
        (For embeddings and inference) Runs the analysis on subsets of the dataset, selected by the ID defined by dataset_split

    emb_layer: int
        Which layer embeddings should be extracted from and compared.

    emb_filter: ['cell1', 'cell2'...]
        Allows the user to narrow down the range of cells that embeddings will be extracted from.

    max_cells: int
        Maximum number of cells that embeddings should be extracted from.

    freeze_layers: int
        Number of layers that should be kept frozen during fine-tuning, starting from the first layer (4 matches the setting used for the pretrained model).

    sample_data: float
        What proportion of the Hugging Face dataset should be used
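
    Example
    -------------------
    A minimal, illustrative call (a sketch, not a verified recipe): the
    dataset path is the example path from the signature defaults, and
    'finetuned_cell_classifier' is a hypothetical output directory name.

        finetune_cells(
            dataset = 'Genecorpus-30M/example_input_files/cell_classification/cell_type_annotation/cell_type_train_data.dataset/',
            label = 'cell_type',
            epochs = 1,
            geneformer_batch_size = 12,
            output_dir = 'finetuned_cell_classifier',
        )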

    '''


# Gene Classifier
def classify_genes(gene_info = "Genecorpus-30M/example_input_files/gene_info_table.csv",
                   genes = "Genecorpus-30M/example_input_files/gene_classification/dosage_sensitive_tfs/dosage_sens_tf_labels.csv",
                   corpus_30M = "Genecorpus-30M/genecorpus_30M_2048.dataset/",
                   model = '.',
                   max_input_size = 2 ** 11,
                   max_lr = 5e-5,
                   freeze_layers = 4,
                   num_gpus = 1,
                   num_proc = os.cpu_count(),
                   geneformer_batch_size = 9,
                   epochs = 1,
                   filter_dataset = 50_000,
                   emb_extract = True,
                   emb_layer = 0,
                   forward_batch = 200,
                   filter_data = None,
                   inference = False,
                   k_validate = True,
                   model_location = "230917_geneformer_GeneClassifier_dosageTF_L2048_B12_LR5e-05_LSlinear_WU500_E1_Oadamw_n10000_F4/",
                   skip_training = False,
                   emb_dir = 'gene_emb',
                   output_dir = None,
                   max_cells = 1000,
                   num_cpus = os.cpu_count()):

    """"
    Primary Parameters
    -----------

    gene_info: path
        Path to gene mappings

    corpus_30M: path
        Path to 30M Gene Corpus

    model: path
        Path to pretrained GeneFormer model

    genes: path
        Path to a csv file whose columns contain genes and whose column headers are the class labels

    inference: bool
        Whether the model should be used to run inference. If False, the model will train on the labeled data instead. Defaults to False

    k_validate: bool
        Whether the model should run k-fold validation or simply perform regular training/evaluation. Defaults to True

    skip_training: bool
        Whether the model should skip the training portion. Defaults to False

    emb_extract: bool
        Whether the model should extract embeddings for a given gene (WIP). Defaults to True

    Customization Parameters
    -----------

    freeze_layers: int
        Freezes the first x layers of the model. Default is 4 (leaving 2 layers unfrozen)

    filter_dataset: int
        Number of cells to keep (filter) from the 30M dataset. Default is 50_000

    emb_layer: int
        Which layer embeddings are extracted from. Default is 0

    filter_data: str, list
        Filters the embeddings down to a single category. Default is None
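
    Example
    -----------
    A minimal, illustrative call (a sketch, not a verified recipe): the
    genes and corpus paths are the example paths from the signature
    defaults, and 'gene_classifier_output' is a hypothetical output
    directory.

        classify_genes(
            genes = "Genecorpus-30M/example_input_files/gene_classification/dosage_sensitive_tfs/dosage_sens_tf_labels.csv",
            corpus_30M = "Genecorpus-30M/genecorpus_30M_2048.dataset/",
            epochs = 1,
            k_validate = True,
            output_dir = 'gene_classifier_output',
        )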


    """