Christina Theodoris committed
Commit • 025e1b8
Parent(s): e562c0c

update cell classifier module

Files changed:
- .pre-commit-config.yaml +25 -0
- Immune_modelpredictions.pickle +0 -0
- gene_embclasses.txt +0 -1
- gene_embdataset.pk +0 -0
- Cell_classifier.py → geneformer/cell_classifier.py +612 -599
- Gene_classifier.py → geneformer/gene_classifier.py +448 -259
- Modular_usage.md → geneformer/modular_classifier_usage.md +76 -76
.pre-commit-config.yaml ADDED
@@ -0,0 +1,25 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.2.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-added-large-files
+      - id: check-merge-conflict
+      - id: mixed-line-ending
+      - id: check-docstring-first
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.1.4
+    hooks:
+      # Run the Ruff linter.
+      - id: ruff
+      # Run the Ruff formatter.
+      - id: ruff-format
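
Once installed, these hooks run on every git commit; to exercise the whole configuration up front, a minimal sketch (assuming the pre-commit package is available in the environment):

import subprocess

# Register the git hook, then run every configured hook against all
# tracked files, mirroring a first-time setup or a CI check.
subprocess.run(["pre-commit", "install"], check=True)
subprocess.run(["pre-commit", "run", "--all-files"], check=False)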
Immune_modelpredictions.pickle DELETED
Binary file (99.1 kB)

gene_embclasses.txt DELETED
@@ -1 +0,0 @@
-[2, {0: 0, 1: 0}]

gene_embdataset.pk DELETED
Binary file (1.76 kB)
Cell_classifier.py → geneformer/cell_classifier.py RENAMED

The module moves into the geneformer package, its imports are re-sorted, and the main entry point is renamed from finetune_cells(...) to classify_cells(...); the new file follows below. Removed outright in this commit and not carried over into the new module:

from sklearn.preprocessing import label_binarize
import pyarrow as pa
import concurrent.futures
from ray.tune import ExperimentAnalysis

# Function for tokenizing genes into ranked-value encodings from Geneformer
def tokenize_dataset(gene_set, type = None, token_set = 'token_dictionary.pkl', species = 'human'):
    token_dataset = open(token_set, 'rb')
    token_dict = pickle.load(token_dataset)
    wrap = True

    if isinstance(gene_set[0], list) == False:
        gene_set = [gene_set]
        wrap = False

    pool = Pool()
    converted_set = []

    def process_gene(gene):
        api_url = f"https://rest.ensembl.org/xrefs/symbol/{species}/{gene}?object_type=gene"
        response = requests.get(api_url, headers={"Content-Type": "application/json"})
        try:
            data = response.json()
            gene = data[0]['id']
        except:
            gene = None
        return gene

    def process_hgnc(gene):
        for gene in tqdm.tqdm(genes, total = len(genes)):
            api_url = f"https://rest.ensembl.org/xrefs/symbol/{species}/{hgnc_id}?object_type=gene"
            response = requests.get(api_url, headers={"Content-Type": "application/json"})
            try:
                data = response.json()
                gene = data[0]['id']
            except:
                gene = None
            return gene

    def process_go(gene):
        mg = mygene.MyGeneInfo()
        results = mg.query(gene, scopes="go", species=species, fields="ensembl.gene")

        ensembl_ids = []
        max_score = 0
        for hit_num, hit in enumerate(results["hits"]):
            if hit['_score'] > max_score:
                max_score = hit['_score']
                chosen_hit = hit
        try:
            try:
                gene = chosen_hit["ensembl"]["gene"]
            except:
                gene = chosen_hit["ensembl"][0]["gene"]
        except:
            gene = None
        return gene

    if type == None or type.upper() == 'ENSEMBL':
        converted_set = gene_set
    elif type.upper() == 'GENE':
        for genes in gene_set:
            converted_genes = []
            for result in tqdm.tqdm(pool.imap(process_gene, genes), total = len(genes)):
                converted_genes.append(result)
            converted_set.append(converted_genes)
    elif type.upper() == 'GO':
        for genes in gene_set:
            converted_genes = []
            for result in tqdm.tqdm(pool.imap(process_go, genes), total = len(genes)):
                converted_genes.append(result)
            converted_set.append(converted_genes)
    elif type.upper() == 'HGNC':
        for genes in gene_set:
            converted_genes = []
            for result in tqdm.tqdm(pool.imap(process_hgnc, genes), total = len(genes)):
                converted_genes.append(result)
            converted_set.append(converted_genes)

    Chembl = []
    for set_num, set in enumerate(converted_set):
        Chembl.append([])
        for gene in set:
            if gene == None:
                Chembl[set_num].append(None)
            else:
                try:
                    Chembl[set_num].append(token_dict[gene])
                except:
                    print(f'{gene} not found in tokenized dataset!')
                    Chembl[set_num].append(None)

    if wrap == False:
        Chembl = Chembl[0]

    return Chembl


# '/work/ccnr/GeneFormer/GeneFormer_repo/Genecorpus-30M/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset/'
# '/work/ccnr/GeneFormer/GeneFormer_repo/Genecorpus-30M/example_input_files/cell_classification/cell_type_annotation/cell_type_train_data.dataset/'
'''
======================================================
PRIMARY CELL - CLASSIFIER AND EMBEDDING EXTRACTOR CLASS
+++++++++++++++++++++++++++++++++++++++++++++++++++++++
'''

'''
For loading and pretraining with custom median expressions and/or custom gene conversions
-------------------------------------------------------------

token set: path
    Path to token conversion dictionary

median set: path
    Path to median gene dictionary (ensembl IDs as the keys)


median_data = pickle.load(open(median_set, 'rb'))
median_data['<pad>'] = None
median_data['<mask>'] = None

token_set = pickle.load(open(token_set, 'rb'))
median_dict = {key:median_data[key] for key in list(token_set.keys())}
'''

if __name__ == '__main__':
    predictions = finetune_cells(skip_training = False, dataset_split = None, label = "disease", sample_data = .5, data_filter = 'hcm', epochs = 10, output_dir = 'hcm_model', model_location = 'hcm_model',
                                 emb_extract = True, geneformer_batch_size = 12, inference = False, dataset = "/work/ccnr/GeneFormer/GeneFormer_repo/Genecorpus-30M/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset/")
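
The removed tokenize_dataset resolved gene symbols through the Ensembl REST xrefs endpoint seen above. For reference, a minimal self-contained sketch of that lookup (symbol_to_ensembl is a hypothetical helper name; assumes network access and the requests package, with TP53/human as arbitrary example inputs):

import requests

def symbol_to_ensembl(gene, species="human"):
    # Ask Ensembl for gene-type cross-references matching a symbol.
    url = f"https://rest.ensembl.org/xrefs/symbol/{species}/{gene}?object_type=gene"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    try:
        return response.json()[0]["id"]
    except (ValueError, IndexError, KeyError):
        return None

print(symbol_to_ensembl("TP53"))  # ENSG00000141510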
1 |
+
"""
|
2 |
+
Geneformer cell classifier.
|
3 |
+
|
4 |
+
Usage:
|
5 |
+
from geneformer import classify_cells
|
6 |
+
classify_cells(
|
7 |
+
token_set=Path("geneformer/token_dictionary.pkl"),
|
8 |
+
median_set=Path("geneformer/gene_median_dictionary.pkl"),
|
9 |
+
pretrained_model=".",
|
10 |
+
dataset="Genecorpus-30M/example_input_files/cell_classification/cell_type_annotation/cell_type_train_data.dataset/",
|
11 |
+
dataset_split=None,
|
12 |
+
filter_cells=0.005,
|
13 |
+
epochs=1,
|
14 |
+
cpu_cores=os.cpu_count(),
|
15 |
+
geneformer_batch_size=12,
|
16 |
+
optimizer="adamw",
|
17 |
+
max_lr=5e-5,
|
18 |
+
num_gpus=torch.cuda.device_count(),
|
19 |
+
max_input_size=2**11,
|
20 |
+
lr_schedule_fn="linear",
|
21 |
+
warmup_steps=500,
|
22 |
+
freeze_layers=0,
|
23 |
+
emb_extract=False,
|
24 |
+
max_cells=1000,
|
25 |
+
emb_layer=0,
|
26 |
+
emb_filter=None,
|
27 |
+
emb_dir="embeddings",
|
28 |
+
overwrite=True,
|
29 |
+
label="cell_type",
|
30 |
+
data_filter=None,
|
31 |
+
forward_batch=200,
|
32 |
+
model_location=None,
|
33 |
+
skip_training=False,
|
34 |
+
sample_data=1,
|
35 |
+
inference=False,
|
36 |
+
optimize_hyperparameters=False,
|
37 |
+
output_dir=None,
|
38 |
+
)
|
39 |
+
"""
|
40 |
+
|
41 |
+
import ast
|
42 |
+
import datetime
|
43 |
import os
|
44 |
+
import pickle
|
|
|
45 |
import random
|
|
|
|
|
46 |
import subprocess
|
47 |
+
from collections import Counter
|
48 |
+
from pathlib import Path
|
49 |
+
|
50 |
+
import numpy as np
|
51 |
+
import seaborn as sns
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
import torch
|
53 |
import torch.nn.functional as F
|
54 |
+
from datasets import load_from_disk
|
55 |
+
from matplotlib import pyplot as plt
|
|
|
56 |
from ray import tune
|
|
|
57 |
from ray.tune.search.hyperopt import HyperOptSearch
|
58 |
+
from sklearn.metrics import accuracy_score
|
59 |
+
from sklearn.metrics import auc as precision_auc
|
60 |
+
from sklearn.metrics import f1_score, precision_recall_curve, roc_auc_score, roc_curve
|
61 |
+
from transformers import BertForSequenceClassification, Trainer
|
62 |
+
from transformers.training_args import TrainingArguments
|
63 |
+
|
64 |
+
from geneformer import DataCollatorForCellClassification, EmbExtractor
|
65 |
+
|
66 |
+
sns.set()
|
67 |
|
68 |
# Properly sets up NCCV environment
|
69 |
+
GPU_NUMBER = [i for i in range(torch.cuda.device_count())]
|
70 |
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(s) for s in GPU_NUMBER])
|
71 |
os.environ["NCCL_DEBUG"] = "INFO"
|
72 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
73 |
|
74 |
+
|
75 |
+
# Function for generating an ROC curve from data
|
76 |
+
def ROC(prediction, truth, type="GeneFormer", label=""):
|
77 |
fpr, tpr, _ = roc_curve(truth, prediction[:, 1])
|
78 |
auc = roc_auc_score(truth, prediction[:, 1])
|
79 |
+
print(f"{type} AUC: {auc}")
|
80 |
+
plt.plot(fpr, tpr, label="AUC=" + str(auc))
|
81 |
+
plt.ylabel("True Positive Rate")
|
82 |
+
plt.xlabel("False Positive Rate")
|
83 |
+
plt.title(f"{label} ROC Curve")
|
84 |
plt.legend(loc=4)
|
85 |
+
plt.savefig("ROC.png")
|
86 |
+
|
87 |
+
return tpr, fpr, auc
|
88 |
+
|
89 |
+
|
90 |
# Identifies cosine similarity between two embeddings. 0 is perfectly dissimilar and 1 is perfectly similar
|
91 |
+
def similarity(tensor1, tensor2, cosine=False):
|
92 |
+
if cosine is False:
|
|
|
93 |
if tensor1.ndimension() > 1:
|
94 |
tensor1 = tensor1.view(1, -1)
|
95 |
if tensor2.ndimension() > 1:
|
|
|
99 |
norm_tensor2 = torch.norm(tensor2)
|
100 |
epsilon = 1e-8
|
101 |
similarity = dot_product / (norm_tensor1 * norm_tensor2 + epsilon)
|
102 |
+
similarity = (similarity.item() + 1) / 2
|
103 |
else:
|
104 |
if tensor1.shape != tensor2.shape:
|
105 |
raise ValueError("Input tensors must have the same shape.")
|
|
|
108 |
dot_product = torch.dot(tensor1, tensor2)
|
109 |
norm_tensor1 = torch.norm(tensor1)
|
110 |
norm_tensor2 = torch.norm(tensor2)
|
111 |
+
|
112 |
# Avoid division by zero by adding a small epsilon
|
113 |
epsilon = 1e-8
|
114 |
similarity = dot_product / (norm_tensor1 * norm_tensor2 + epsilon)
|
115 |
+
|
116 |
return similarity.item()
|
117 |
+
|
118 |
+
|
119 |
# Plots heatmap between different classes/labels
|
120 |
def plot_similarity_heatmap(similarities):
|
121 |
classes = list(similarities.keys())
|
|
|
128 |
else:
|
129 |
val = similarities[c][cc]
|
130 |
arr[i][j] = val
|
131 |
+
|
132 |
plt.figure(figsize=(8, 6))
|
133 |
+
plt.imshow(arr, cmap="inferno", vmin=0, vmax=1)
|
134 |
plt.colorbar()
|
135 |
+
plt.xticks(np.arange(classlen), classes, rotation=45, ha="right")
|
136 |
plt.yticks(np.arange(classlen), classes)
|
137 |
plt.title("Similarity Heatmap")
|
138 |
plt.savefig("similarity_heatmap.png")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
|
|
|
|
|
|
|
140 |
|
141 |
+
def classify_cells(
|
142 |
+
token_set=Path("./token_dictionary.pkl"),
|
143 |
+
median_set=Path("./gene_median_dictionary.pkl"),
|
144 |
+
pretrained_model="../",
|
145 |
+
dataset="Genecorpus-30M/example_input_files/cell_classification/cell_type_annotation/cell_type_train_data.dataset/",
|
146 |
+
dataset_split=None,
|
147 |
+
filter_cells=0.005,
|
148 |
+
epochs=1,
|
149 |
+
cpu_cores=os.cpu_count(),
|
150 |
+
training_batch_size=12,
|
151 |
+
optimizer="adamw",
|
152 |
+
max_lr=5e-5,
|
153 |
+
num_gpus=torch.cuda.device_count(),
|
154 |
+
max_input_size=2**11,
|
155 |
+
lr_schedule_fn="linear",
|
156 |
+
warmup_steps=500,
|
157 |
+
freeze_layers=0,
|
158 |
+
emb_extract=False,
|
159 |
+
max_cells=None,
|
160 |
+
emb_layer=-1,
|
161 |
+
emb_filter=None,
|
162 |
+
emb_dir="embeddings",
|
163 |
+
overwrite=False,
|
164 |
+
label="cell_type",
|
165 |
+
data_filter=None,
|
166 |
+
inference_batch_size=200,
|
167 |
+
finetuned_model=None,
|
168 |
+
skip_training=False,
|
169 |
+
sample_data=1,
|
170 |
+
inference=False,
|
171 |
+
optimize_hyperparameters=True,
|
172 |
+
output_dir=None,
|
173 |
+
):
|
174 |
+
"""
|
175 |
Primary Parameters
|
176 |
-------------------
|
177 |
dataset: path
|
178 |
+
Path to fine-tuning dataset for training
|
179 |
+
|
180 |
+
finetuned_model: path
|
181 |
+
Path to location of fine-tuned model to use for inference and embedding extraction
|
182 |
|
|
|
|
|
|
|
183 |
pretrained_model: path
|
184 |
+
Path to pretrained Geneformer model
|
185 |
+
|
186 |
+
inference: bool
|
187 |
+
Indicates whether to perform inference and return a list of similarities. Defaults to False.
|
188 |
+
|
189 |
+
skip_training: bool
|
190 |
+
Indicates whether to skip training the model. Defaults to False.
|
191 |
+
|
192 |
emb_extract: bool
|
193 |
+
Indicates whether to extract embeddings and calculate similarities. Defaults to True.
|
194 |
+
|
195 |
optimize_hyperparameters: bool
|
196 |
+
Indicates whether to optimize model hyperparamters. Defaults to False.
|
197 |
+
|
198 |
+
|
199 |
Customization Parameters
|
200 |
-------------------
|
201 |
+
|
202 |
dataset_split: str
|
203 |
+
Indicates how the dataset should be partitioned (if at all), and what ID should be used for partitioning
|
204 |
+
|
205 |
data_filter: list
|
206 |
+
(For embeddings and inference) Runs analysis on subsets of the dataset based on the ID defined by dataset_split
|
207 |
+
|
208 |
label: str
|
209 |
+
Feature to read as a classification label.
|
210 |
+
|
211 |
emb_layer: int
|
212 |
+
What layer embeddings should be extracted and compared.
|
213 |
+
|
214 |
emb_filter: ['cell1', 'cell2'...]
|
215 |
Allows user to narrow down range of cells that embeddings will be extracted from.
|
216 |
+
|
217 |
max_cells: int
|
218 |
+
Max number of cells to use for embedding extraction.
|
219 |
+
|
220 |
freeze_layers: int
|
221 |
+
Number of layers that should be frozen during fine-tuning.
|
222 |
+
|
223 |
sample_data: float
|
224 |
+
Proportion of the dataset that should be used.
|
225 |
+
|
226 |
+
"""
|
227 |
+
|
228 |
dataset_list = []
|
229 |
evalset_list = []
|
230 |
split_list = []
|
231 |
target_dict_list = []
|
232 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
233 |
train_dataset = load_from_disk(dataset)
|
234 |
num_samples = int(len(train_dataset) * sample_data)
|
235 |
random_indices = random.sample(range(len(train_dataset)), num_samples)
|
236 |
train_dataset = train_dataset.select(random_indices)
|
237 |
+
|
238 |
+
sample = int(sample_data * len(train_dataset))
|
239 |
sample_indices = random.sample(range(len(train_dataset)), sample)
|
240 |
train_dataset = train_dataset.select(sample_indices)
|
241 |
|
242 |
+
def if_not_rare_cell_state(example):
|
243 |
return example[label] in cells_to_keep
|
244 |
+
|
245 |
# change labels to numerical ids
|
246 |
def classes_to_ids(example):
|
247 |
example["label"] = target_name_id_dict[example["label"]]
|
248 |
return example
|
249 |
+
|
250 |
def if_trained_label(example):
|
251 |
+
return example["label"] in trained_labels
|
252 |
+
|
253 |
+
if skip_training is not True:
|
254 |
+
|
255 |
def compute_metrics(pred):
|
256 |
labels = pred.label_ids
|
257 |
preds = pred.predictions.argmax(-1)
|
258 |
# calculate accuracy and macro f1 using sklearn's function
|
259 |
acc = accuracy_score(labels, preds)
|
260 |
+
macro_f1 = f1_score(labels, preds, average="macro")
|
261 |
+
return {"accuracy": acc, "macro_f1": macro_f1}
|
262 |
+
|
|
|
|
|
|
|
263 |
# Defines custom exceptions for collecting labels (default excluded)
|
264 |
+
excep = {"bone_marrow": "immune"}
|
265 |
+
|
266 |
+
if dataset_split is not None:
|
267 |
+
if data_filter is not None:
|
268 |
+
split_iter = [data_filter]
|
269 |
else:
|
270 |
+
split_iter = Counter(train_dataset[dataset_split]).keys()
|
271 |
for lab in split_iter:
|
|
|
272 |
# collect list of tissues for fine-tuning (immune and bone marrow are included together)
|
273 |
if lab in list(excep.keys()):
|
274 |
continue
|
275 |
elif lab == list(excep.values()):
|
276 |
+
split_ids = [excep.keys(), excep.values()]
|
277 |
split_list += [excep.values()]
|
278 |
else:
|
279 |
split_ids = [lab]
|
280 |
split_list += [lab]
|
281 |
+
|
282 |
# filter datasets for given organ
|
283 |
def if_label(example):
|
284 |
return example[dataset_split] == lab
|
285 |
+
|
286 |
trainset_label = train_dataset.filter(if_label, num_proc=cpu_cores)
|
287 |
label_counter = Counter(trainset_label[label])
|
288 |
total_cells = sum(label_counter.values())
|
289 |
+
|
290 |
+
# excludes cells with a low proportion in the dataset
|
291 |
+
cells_to_keep = [
|
292 |
+
k
|
293 |
+
for k, v in label_counter.items()
|
294 |
+
if v > (filter_cells * total_cells)
|
295 |
+
]
|
296 |
+
trainset_label_subset = trainset_label.filter(
|
297 |
+
if_not_rare_cell_state, num_proc=cpu_cores
|
298 |
+
)
|
299 |
+
|
300 |
# shuffle datasets and rename columns
|
301 |
trainset_label_shuffled = trainset_label_subset.shuffle(seed=42)
|
302 |
+
trainset_label_shuffled = trainset_label_shuffled.rename_column(
|
303 |
+
label, "label"
|
304 |
+
)
|
305 |
+
trainset_label_shuffled = trainset_label_shuffled.remove_columns(
|
306 |
+
dataset_split
|
307 |
+
)
|
308 |
+
|
309 |
# create dictionary of cell types : label ids
|
310 |
target_names = list(Counter(trainset_label_shuffled["label"]).keys())
|
311 |
+
target_name_id_dict = dict(
|
312 |
+
zip(target_names, [i for i in range(len(target_names))])
|
313 |
+
)
|
314 |
target_dict_list += [target_name_id_dict]
|
315 |
+
|
316 |
+
labeled_trainset = trainset_label_shuffled.map(
|
317 |
+
classes_to_ids, num_proc=cpu_cores
|
318 |
+
)
|
319 |
+
|
320 |
# create 80/20 train/eval splits
|
321 |
+
labeled_train_split = trainset_label_shuffled.select(
|
322 |
+
[i for i in range(0, round(len(labeled_trainset) * 0.8))]
|
323 |
+
)
|
324 |
+
labeled_eval_split = trainset_label_shuffled.select(
|
325 |
+
[
|
326 |
+
i
|
327 |
+
for i in range(
|
328 |
+
round(len(labeled_trainset) * 0.8), len(labeled_trainset)
|
329 |
+
)
|
330 |
+
]
|
331 |
+
)
|
332 |
+
|
333 |
# filter dataset for cell types in corresponding training set
|
334 |
trained_labels = list(Counter(labeled_train_split["label"]).keys())
|
335 |
+
|
336 |
+
labeled_eval_split_subset = labeled_eval_split.filter(
|
337 |
+
if_trained_label, num_proc=cpu_cores
|
338 |
+
)
|
339 |
+
|
340 |
dataset_list += [labeled_train_split]
|
341 |
evalset_list += [labeled_eval_split_subset]
|
342 |
+
|
343 |
+
trainset_dict = dict(zip(split_list, dataset_list))
|
344 |
+
traintargetdict_dict = dict(zip(split_list, target_dict_list))
|
345 |
+
evalset_dict = dict(zip(split_list, evalset_list))
|
346 |
+
|
347 |
for lab in split_list:
|
348 |
label_trainset = trainset_dict[lab]
|
349 |
label_evalset = evalset_dict[lab]
|
350 |
label_dict = traintargetdict_dict[lab]
|
351 |
+
|
352 |
# set logging steps
|
353 |
+
logging_steps = round(len(label_trainset) / training_batch_size / 10)
|
354 |
if logging_steps == 0:
|
355 |
logging_steps = 1
|
356 |
+
|
357 |
+
# load pretrained model
|
358 |
+
model = BertForSequenceClassification.from_pretrained(
|
359 |
+
pretrained_model,
|
360 |
+
num_labels=len(label_dict.keys()),
|
361 |
+
output_attentions=False,
|
362 |
+
output_hidden_states=False,
|
363 |
+
).to(device)
|
364 |
+
|
365 |
# define output directory path
|
366 |
current_date = datetime.datetime.now()
|
367 |
datestamp = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}"
|
368 |
+
|
369 |
+
if output_dir is None:
|
370 |
+
output_dir = f"{datestamp}_geneformer_CellClassifier_{lab}_L{max_input_size}_B{training_batch_size}_LR{max_lr}_LS{lr_schedule_fn}_WU{warmup_steps}_E{epochs}_O{optimizer}_F{freeze_layers}/"
|
371 |
+
|
372 |
# ensure not overwriting previously saved model
|
373 |
+
saved_model_test = os.path.join(output_dir, "pytorch_model.bin")
|
374 |
+
|
375 |
+
if os.path.isfile(saved_model_test) is True and overwrite is False:
|
376 |
raise Exception("Model already saved to this directory.")
|
377 |
+
|
378 |
# make output directory
|
379 |
+
subprocess.call(f"mkdir -p {output_dir}", shell=True)
|
380 |
+
|
381 |
# set training arguments
|
382 |
training_args = {
|
383 |
"learning_rate": max_lr,
|
|
|
392 |
"lr_scheduler_type": lr_schedule_fn,
|
393 |
"warmup_steps": warmup_steps,
|
394 |
"weight_decay": 0.001,
|
395 |
+
"per_device_train_batch_size": training_batch_size,
|
396 |
+
"per_device_eval_batch_size": training_batch_size,
|
397 |
"num_train_epochs": epochs,
|
398 |
"load_best_model_at_end": True,
|
399 |
"output_dir": output_dir,
|
400 |
}
|
401 |
+
|
|
|
402 |
training_args_init = TrainingArguments(**training_args)
|
403 |
+
true_labels = label_evalset["label"]
|
404 |
+
|
405 |
+
if optimize_hyperparameters is False:
|
|
|
406 |
# create the trainer
|
407 |
trainer = Trainer(
|
408 |
+
model=model,
|
409 |
+
args=training_args_init,
|
410 |
+
data_collator=DataCollatorForCellClassification(),
|
411 |
+
train_dataset=label_trainset,
|
412 |
+
eval_dataset=label_evalset,
|
413 |
+
compute_metrics=compute_metrics,
|
414 |
)
|
415 |
+
|
416 |
# train the cell type classifier
|
417 |
trainer.train()
|
418 |
predictions = trainer.predict(label_evalset)
|
419 |
+
print(
|
420 |
+
f'accuracy: {accuracy_score(predictions.argmax(), label_evalset["labels"])}'
|
421 |
+
)
|
422 |
+
|
423 |
tpr, fpr, auc = ROC(predictions.predictions, true_labels)
|
424 |
+
|
425 |
metrics = compute_metrics(predictions)
|
426 |
with open(f"{output_dir}predictions.pickle", "wb") as fp:
|
427 |
pickle.dump(predictions, fp)
|
428 |
+
|
429 |
+
trainer.save_metrics("eval", predictions.metrics)
|
430 |
+
|
431 |
+
with open(f"{output_dir}/targets.txt", "w") as f:
|
432 |
if len(target_dict_list) == 1:
|
433 |
f.write(str(target_dict_list[0]))
|
434 |
else:
|
435 |
f.write(str(target_dict_list))
|
436 |
+
|
437 |
+
try:
|
438 |
+
precision, recall, _ = precision_recall_curve(
|
439 |
+
true_labels, predictions.predictions[:, 1]
|
440 |
+
)
|
441 |
+
pr_auc = precision_auc(recall, precision)
|
442 |
+
|
443 |
+
print(f"AUC: {pr_auc}")
|
444 |
+
return recall, precision, pr_auc
|
445 |
+
except:
|
446 |
+
pass
|
447 |
+
|
448 |
trainer.save_model(output_dir)
|
449 |
else:
|
450 |
+
|
451 |
def model_init():
|
452 |
+
model = BertForSequenceClassification.from_pretrained(
|
453 |
+
pretrained_model,
|
454 |
+
num_labels=len(label_dict.keys()),
|
455 |
+
output_attentions=False,
|
456 |
+
output_hidden_states=False,
|
457 |
+
)
|
458 |
if freeze_layers is not None:
|
459 |
modules_to_freeze = model.bert.encoder.layer[:freeze_layers]
|
460 |
for module in modules_to_freeze:
|
461 |
for param in module.parameters():
|
462 |
param.requires_grad = False
|
463 |
+
model = model.to(device)
|
464 |
return model
|
465 |
+
|
466 |
trainer = Trainer(
|
467 |
model_init=model_init,
|
468 |
args=training_args_init,
|
469 |
data_collator=DataCollatorForCellClassification(),
|
470 |
train_dataset=label_trainset,
|
471 |
eval_dataset=label_evalset,
|
472 |
+
compute_metrics=compute_metrics,
|
473 |
)
|
474 |
# specify raytune hyperparameter search space
|
475 |
ray_config = {
|
476 |
"num_train_epochs": tune.choice([epochs]),
|
477 |
"learning_rate": tune.loguniform(1e-6, 1e-3),
|
478 |
"weight_decay": tune.uniform(0.0, 0.3),
|
479 |
+
"lr_scheduler_type": tune.choice(
|
480 |
+
["linear", "cosine", "polynomial"]
|
481 |
+
),
|
482 |
"warmup_steps": tune.uniform(100, 2000),
|
483 |
+
"seed": tune.uniform(0, 100),
|
484 |
+
"per_device_train_batch_size": tune.choice(
|
485 |
+
[training_batch_size]
|
486 |
+
),
|
487 |
}
|
488 |
+
|
489 |
+
hyperopt_search = HyperOptSearch(metric="eval_accuracy", mode="max")
|
490 |
+
|
491 |
+
if torch.device == "cuda":
|
492 |
+
resources_per_trial = ({"cpu": 8, "gpu": 1},)
|
|
|
493 |
else:
|
494 |
+
resources_per_trial = {"cpu": 8}
|
495 |
+
|
496 |
# optimize hyperparameters
|
497 |
best_trial = trainer.hyperparameter_search(
|
498 |
direction="maximize",
|
499 |
backend="ray",
|
500 |
+
resources_per_trial=resources_per_trial,
|
501 |
hp_space=lambda _: ray_config,
|
502 |
search_alg=hyperopt_search,
|
503 |
+
n_trials=100, # number of trials
|
504 |
+
progress_reporter=tune.CLIReporter(
|
505 |
+
max_report_frequency=600,
|
506 |
+
sort_by_metric=True,
|
507 |
+
max_progress_rows=100,
|
508 |
+
mode="max",
|
509 |
+
metric="eval_accuracy",
|
510 |
+
metric_columns=["loss", "eval_loss", "eval_accuracy"],
|
511 |
+
),
|
512 |
+
)
|
513 |
best_hyperparameters = best_trial.hyperparameters
|
514 |
+
|
515 |
print("Best Hyperparameters:")
|
516 |
print(best_hyperparameters)
|
517 |
+
|
|
|
|
|
518 |
else:
|
519 |
+
trainset_label = train_dataset
|
520 |
+
label_counter = Counter(trainset_label[label])
|
521 |
+
total_cells = sum(label_counter.values())
|
522 |
+
|
523 |
+
# Excludes cells with a low proportion in the dataset
|
524 |
+
cells_to_keep = [
|
525 |
+
k for k, v in label_counter.items() if v > (filter_cells * total_cells)
|
526 |
+
]
|
527 |
+
trainset_label_subset = trainset_label.filter(
|
528 |
+
if_not_rare_cell_state, num_proc=cpu_cores
|
529 |
+
)
|
530 |
+
|
531 |
+
# shuffle datasets and rename columns
|
532 |
+
trainset_label_shuffled = trainset_label_subset.shuffle(seed=42)
|
533 |
+
trainset_label_shuffled = trainset_label_shuffled.rename_column(
|
534 |
+
label, "label"
|
535 |
+
)
|
536 |
+
|
537 |
+
# create dictionary of cell types : label ids
|
538 |
+
target_names = list(Counter(trainset_label_shuffled["label"]).keys())
|
539 |
+
target_name_id_dict = dict(
|
540 |
+
zip(target_names, [i for i in range(len(target_names))])
|
541 |
+
)
|
542 |
+
target_dict_list = target_name_id_dict
|
543 |
+
|
544 |
+
labeled_trainset = trainset_label_shuffled.map(
|
545 |
+
classes_to_ids, num_proc=cpu_cores
|
546 |
+
)
|
547 |
+
|
548 |
+
# create 80/20 train/eval splits
|
549 |
+
labeled_train_split = labeled_trainset.select(
|
550 |
+
[i for i in range(0, round(len(labeled_trainset) * 0.8))]
|
551 |
+
)
|
552 |
+
labeled_eval_split = labeled_trainset.select(
|
553 |
+
[
|
554 |
+
i
|
555 |
+
for i in range(
|
556 |
+
round(len(labeled_trainset) * 0.8), len(labeled_trainset)
|
557 |
+
)
|
558 |
+
]
|
559 |
+
)
|
560 |
+
|
561 |
+
# filter dataset for cell types in corresponding training set
|
562 |
+
trained_labels = list(Counter(labeled_train_split["label"]).keys())
|
563 |
+
labeled_eval_split_subset = labeled_eval_split.filter(
|
564 |
+
if_trained_label, num_proc=cpu_cores
|
565 |
+
)
|
566 |
+
|
567 |
+
# set logging steps
|
568 |
+
logging_steps = round(len(trainset_label) / training_batch_size / 10)
|
569 |
+
|
570 |
+
# load pretrained model
|
571 |
+
model = BertForSequenceClassification.from_pretrained(
|
572 |
+
pretrained_model,
|
573 |
+
num_labels=len(target_dict_list.keys()),
|
574 |
+
output_attentions=False,
|
575 |
+
output_hidden_states=False,
|
576 |
+
).to(device)
|
577 |
+
# define output directory path
|
578 |
+
current_date = datetime.datetime.now()
|
579 |
+
datestamp = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}"
|
580 |
+
|
581 |
+
if output_dir is None:
|
582 |
+
output_dir = f"{datestamp}_geneformer_CellClassifier_L{max_input_size}_B{training_batch_size}_LR{max_lr}_LS{lr_schedule_fn}_WU{warmup_steps}_E{epochs}_O{optimizer}_F{freeze_layers}/"
|
583 |
+
|
584 |
+
# ensure not overwriting previously saved model
|
585 |
+
saved_model_test = os.path.join(output_dir, "pytorch_model.bin")
|
586 |
+
if os.path.isfile(saved_model_test) is True and overwrite is False:
|
587 |
+
raise Exception("Model already saved to this directory.")
|
588 |
+
|
589 |
+
# make output directory
|
590 |
+
subprocess.call(f"mkdir -p {output_dir}", shell=True)
|
591 |
+
|
592 |
+
# set training arguments
|
593 |
+
training_args = {
|
594 |
+
"learning_rate": max_lr,
|
595 |
+
"do_train": True,
|
596 |
+
"do_eval": True,
|
597 |
+
"evaluation_strategy": "epoch",
|
598 |
+
"save_strategy": "epoch",
|
599 |
+
"logging_steps": logging_steps,
|
600 |
+
"group_by_length": True,
|
601 |
+
"length_column_name": "length",
|
602 |
+
"disable_tqdm": False,
|
603 |
+
"lr_scheduler_type": lr_schedule_fn,
|
604 |
+
"warmup_steps": warmup_steps,
|
605 |
+
"weight_decay": 0.001,
|
606 |
+
"per_device_train_batch_size": training_batch_size,
|
607 |
+
"per_device_eval_batch_size": training_batch_size,
|
608 |
+
"num_train_epochs": epochs,
|
609 |
+
"load_best_model_at_end": True,
|
610 |
+
"output_dir": output_dir,
|
611 |
+
}
|
612 |
+
|
613 |
+
training_args_init = TrainingArguments(**training_args)
|
614 |
+
true_labels = labeled_eval_split_subset["label"]
|
615 |
+
|
616 |
+
if optimize_hyperparameters is False:
|
617 |
+
# create the trainer
|
618 |
+
trainer = Trainer(
|
619 |
+
model=model,
|
620 |
+
args=training_args_init,
|
621 |
+
data_collator=DataCollatorForCellClassification(),
|
622 |
+
train_dataset=labeled_train_split,
|
623 |
+
eval_dataset=labeled_eval_split_subset,
|
624 |
+
compute_metrics=compute_metrics,
|
625 |
+
)
|
626 |
+
|
627 |
+
# train the cell type classifier
|
628 |
+
trainer.train()
|
629 |
+
predictions = trainer.predict(labeled_eval_split_subset)
|
630 |
+
predictions_tensor = torch.Tensor(predictions.predictions)
|
631 |
+
predicted_labels = torch.argmax(predictions_tensor, dim=1)
|
632 |
+
print(
|
633 |
+
f'accuracy: {accuracy_score(predicted_labels, labeled_eval_split_subset["label"])}'
|
634 |
+
)
|
635 |
+
metrics = compute_metrics(predictions)
|
636 |
+
|
637 |
+
with open(f"{output_dir}predictions.pickle", "wb") as fp:
|
638 |
+
pickle.dump(predictions.predictions.argmax(-1), fp)
|
639 |
+
|
640 |
+
trainer.save_metrics("eval", predictions.metrics)
|
641 |
+
trainer.save_model(output_dir)
|
642 |
+
|
643 |
+
# Saves label conversion dictionary to output directory
|
644 |
+
with open(f"{output_dir}/targets.txt", "w") as f:
|
645 |
+
f.write(str(target_dict_list))
|
646 |
+
|
647 |
+
try:
|
648 |
+
precision, recall, _ = precision_recall_curve(
|
649 |
+
true_labels, predictions.predictions[:, 1]
|
650 |
+
)
|
651 |
+
pr_auc = precision_auc(recall, precision)
|
652 |
+
|
653 |
+
print(f"AUC: {pr_auc}")
|
654 |
+
return recall, precision, pr_auc
|
655 |
+
except:
|
656 |
+
pass
|
657 |
+
|
658 |
+
else:
|
659 |
+
# Optimizes hyperparameters
|
660 |
+
|
661 |
+
num_classes = len(list(set(labeled_train_split["label"])))
|
662 |
+
|
663 |
+
def model_init():
|
664 |
+
model = BertForSequenceClassification.from_pretrained(
|
665 |
+
pretrained_model,
|
666 |
+
num_labels=num_classes,
|
667 |
+
output_attentions=False,
|
668 |
+
output_hidden_states=False,
|
669 |
+
)
|
670 |
+
|
671 |
+
if freeze_layers is not None:
|
672 |
+
modules_to_freeze = model.bert.encoder.layer[:freeze_layers]
|
673 |
+
for module in modules_to_freeze:
|
674 |
+
for param in module.parameters():
|
675 |
+
param.requires_grad = False
|
676 |
+
model = model.to(device)
|
677 |
+
return model
|
678 |
+
|
679 |
+
# create the trainer
|
680 |
+
trainer = Trainer(
|
681 |
+
model_init=model_init,
|
682 |
+
args=training_args_init,
|
683 |
+
data_collator=DataCollatorForCellClassification(),
|
684 |
+
train_dataset=labeled_train_split,
|
685 |
+
eval_dataset=labeled_eval_split_subset,
|
686 |
+
compute_metrics=compute_metrics,
|
687 |
+
)
|
688 |
+
|
689 |
+
# specify raytune hyperparameter search space
|
690 |
+
ray_config = {
|
691 |
+
"num_train_epochs": tune.choice([epochs]),
|
692 |
+
"learning_rate": tune.loguniform(1e-6, 1e-3),
|
693 |
+
"weight_decay": tune.uniform(0.0, 0.3),
|
694 |
+
"lr_scheduler_type": tune.choice(
|
695 |
+
["linear", "cosine", "polynomial"]
|
696 |
+
),
|
697 |
+
"warmup_steps": tune.uniform(100, 2000),
|
698 |
+
"seed": tune.uniform(0, 100),
|
699 |
+
"per_device_train_batch_size": tune.choice([training_batch_size]),
|
700 |
+
}
|
701 |
+
|
702 |
+
hyperopt_search = HyperOptSearch(metric="eval_accuracy", mode="max")
|
703 |
+
|
704 |
+
if torch.device == "cuda":
|
705 |
+
resources_per_trial = ({"cpu": 8, "gpu": 1},)
|
706 |
+
else:
|
707 |
+
resources_per_trial = {"cpu": 8}
|
708 |
+
|
709 |
+
# optimize hyperparameters
|
710 |
+
best_trial = trainer.hyperparameter_search(
|
711 |
+
direction="maximize",
|
712 |
+
backend="ray",
|
713 |
+
resources_per_trial=resources_per_trial,
|
714 |
+
hp_space=lambda _: ray_config,
|
715 |
+
search_alg=hyperopt_search,
|
716 |
+
n_trials=100, # number of trials
|
717 |
+
progress_reporter=tune.CLIReporter(
|
718 |
+
max_report_frequency=600,
|
719 |
+
sort_by_metric=True,
|
720 |
+
max_progress_rows=100,
|
721 |
+
mode="max",
|
722 |
+
metric="eval_accuracy",
|
723 |
+
metric_columns=["loss", "eval_loss", "eval_accuracy"],
|
724 |
+
),
|
725 |
+
)
|
726 |
+
best_hyperparameters = best_trial.hyperparameters
|
727 |
+
|
728 |
+
print("Best Hyperparameters:")
|
729 |
+
print(best_hyperparameters)
|
730 |
+
|
731 |
# Performs Inference with model
|
732 |
+
if inference is True:
|
733 |
+
if dataset_split is not None and data_filter is not None:
|
734 |
+
|
735 |
def if_label(example):
|
736 |
+
return example[dataset_split] == data_filter
|
737 |
+
|
738 |
train_dataset = train_dataset.filter(if_label, num_proc=cpu_cores)
|
739 |
+
|
740 |
trainset_label_shuffled = train_dataset
|
741 |
total_cells = len(trainset_label_shuffled)
|
742 |
+
|
743 |
# loads dictionary of all cell labels model was trained on
|
744 |
+
with open(Path(finetuned_model) / "targets.txt", "r") as f:
|
745 |
data = ast.literal_eval(f.read())
|
746 |
+
if dataset_split is not None and data_filter is None:
|
747 |
indexer = dataset_split.index(data_filter)
|
748 |
data = data[indexer]
|
749 |
+
|
750 |
+
target_dict_list = {key: value for key, value in enumerate(data)}
|
751 |
+
|
752 |
# set logging steps
|
753 |
+
logging_steps = round(len(trainset_label_shuffled) / training_batch_size / 20)
|
754 |
+
|
755 |
+
# load pretrained model
|
756 |
input_ids = trainset_label_shuffled["input_ids"]
|
757 |
inputs = torch.zeros(len(input_ids), max_input_size, dtype=torch.int64)
|
758 |
attention = torch.zeros(len(input_ids), max_input_size, dtype=torch.int64)
|
759 |
+
|
760 |
for i, sentence in enumerate(input_ids):
|
761 |
sentence_length = len(sentence)
|
762 |
if sentence_length <= max_input_size:
|
|
|
765 |
else:
|
766 |
inputs[i, :] = torch.tensor(sentence[:max_input_size])
|
767 |
attention[i, :] = torch.ones(max_input_size)
|
768 |
+
|
769 |
+
model = BertForSequenceClassification.from_pretrained(
|
770 |
+
finetuned_model, num_labels=len(target_dict_list)
|
771 |
+
).to(device)
|
772 |
+
model_outputs = model(inputs.to(device), attention_mask=attention)["logits"]
|
773 |
+
predictions = F.softmax(model_outputs, dim=-1).argmax(-1)
|
774 |
|
775 |
predictions = [target_dict_list[int(pred)] for pred in predictions]
|
776 |
|
777 |
return predictions
|
    # Extracts embeddings from labeled data
    if emb_extract is True:
        if emb_filter is None:
            with open(f"{finetuned_model}/targets.txt", "r") as f:
                data = ast.literal_eval(f.read())
            if dataset_split is not None and data_filter is not None:
                indexer = dataset_split.index(data_filter)
                data = data[indexer]

            target_dict_list = {key: value for key, value in enumerate(data)}
            total_filter = None
        else:
            total_filter = emb_filter

        train_dataset = load_from_disk(dataset)
        if dataset_split is not None:

            def if_label(example):
                return example[dataset_split] == data_filter

            train_dataset = train_dataset.filter(if_label, num_proc=cpu_cores)

        # drop classes rarer than the filter_cells fraction of all cells
        label_counter = Counter(train_dataset[label])
        total_cells = sum(label_counter.values())
        cells_to_keep = [
            k for k, v in label_counter.items() if v > (filter_cells * total_cells)
        ]

        def if_not_rare(example):
            return example[label] in cells_to_keep

        train_dataset = train_dataset.filter(if_not_rare, num_proc=cpu_cores)

        true_labels = train_dataset[label]
        num_classes = len(list(set(true_labels)))

        embex = EmbExtractor(
            model_type="CellClassifier",
            num_classes=num_classes,
            filter_data=total_filter,
            max_ncells=max_cells,
            emb_layer=emb_layer,
            emb_label=[dataset_split, label],
            labels_to_plot=[label],
            forward_batch_size=inference_batch_size,
            nproc=cpu_cores,
        )

        # example dataset: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset
        subprocess.call(f"mkdir -p {emb_dir}", shell=True)

        embs = embex.extract_embs(
            model_directory=finetuned_model,
            input_data_file=dataset,
            output_directory=emb_dir,
            output_prefix=f"{label}_embeddings",
        )
        true_labels = embex.filtered_input_data[label]

        # group each cell's embedding by its true class label
        emb_dict = {cell_label: [] for cell_label in list(set(true_labels))}
        for num, emb in embs.iterrows():
            key = emb[label]
            selection = emb.iloc[:255]
            emb = torch.Tensor(selection)
            emb_dict[key].append(emb)

        # average each class's embeddings into a single mean embedding
        for key in list(emb_dict.keys()):
            stack = torch.stack(emb_dict[key], dim=0)
            emb_dict[key] = torch.mean(stack, dim=0)
        similarities = {key: {} for key in list(emb_dict.keys())}

        for key in list(emb_dict.keys()):
            remaining_keys = [k for k in list(emb_dict.keys()) if k != key]
            for k in remaining_keys:
                embedding = emb_dict[k]
                sim = similarity(emb_dict[key], embedding, cosine=True)

                similarities[key][k] = sim

        plot_similarity_heatmap(similarities)

        embex.plot_embs(
            embs=embs,
            plot_style="umap",
            output_directory=emb_dir,
            output_prefix="emb_plot",
        )

        embex.plot_embs(
            embs=embs,
            plot_style="heatmap",
            output_directory=emb_dir,
            output_prefix="emb_plot",
        )

        return similarities
Gene_classifier.py → geneformer/gene_classifier.py
RENAMED
The pre-rename side of this diff mirrors the updated module below almost entirely through import regrouping and line reflowing; the one substantive removal with no counterpart in the new file is the module-level entry point:

if __name__ == '__main__':
    classify_genes(k_validate = False, inference = False, skip_training = False, emb_extract = True, output_dir = Path('gene_emb'), model_location = Path('gene_emb'), epochs = 5, gene_info = "../GeneFormer_repo/Genecorpus-30M/example_input_files/gene_info_table.csv", genes = "../GeneFormer_repo/Genecorpus-30M/example_input_files/gene_classification/dosage_sensitive_tfs/dosage_sens_tf_labels.csv", corpus_30M = "../GeneFormer_repo/Genecorpus-30M/genecorpus_30M_2048.dataset/")

The updated geneformer/gene_classifier.py:
import os
import sys

GPU_NUMBER = [0]  # CHANGE WITH MULTIGPU
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(s) for s in GPU_NUMBER])
os.environ["NCCL_DEBUG"] = "INFO"

import ast
import datetime
import math
import pickle
import subprocess
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from datasets import Dataset, load_from_disk
from sklearn import preprocessing
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    auc,
    confusion_matrix,
    f1_score,  # used by compute_metrics below; missing from the original import list
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from tqdm.notebook import tqdm
from transformers import BertForTokenClassification, Trainer
from transformers.training_args import TrainingArguments

from geneformer import DataCollatorForGeneClassification, EmbExtractor, TranscriptomeTokenizer
from geneformer.pretrainer import token_dictionary

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def vote(logit_pair):
    a, b = logit_pair
    if a > b:
        return 0
    elif b > a:
        return 1
    elif a == b:
        return "tie"


def py_softmax(vector):
    e = np.exp(vector)
    return e / e.sum()


# Computes similarity between two embeddings: 0 is perfectly dissimilar and 1 is perfectly similar
def similarity(tensor1, tensor2, cosine=True):
    if cosine == False:
        if tensor1.ndimension() > 1:
            tensor1 = tensor1.view(1, -1)
        if tensor2.ndimension() > 1:
            tensor2 = tensor2.view(1, -1)
        # dot product of the flattened tensors
        dot_product = torch.sum(tensor1 * tensor2)
        norm_tensor1 = torch.norm(tensor1)
        norm_tensor2 = torch.norm(tensor2)
        epsilon = 1e-8
        similarity = dot_product / (norm_tensor1 * norm_tensor2 + epsilon)
        # rescale from [-1, 1] to [0, 1] and return a plain float
        return (similarity.item() + 1) / 2
    else:
        if tensor1.shape != tensor2.shape:
            raise ValueError("Input tensors must have the same shape.")

        dot_product = torch.dot(tensor1, tensor2)
        norm_tensor1 = torch.norm(tensor1)
        norm_tensor2 = torch.norm(tensor2)

        # Avoid division by zero by adding a small epsilon
        epsilon = 1e-8
        similarity = dot_product / (norm_tensor1 * norm_tensor2 + epsilon)

        return similarity.item()


# Plots heatmap between different classes/labels
def plot_similarity_heatmap(similarities):
    classes = list(similarities.keys())
    classlen = len(classes)
    arr = np.zeros((classlen, classlen))
    for i, c in enumerate(classes):
        for j, cc in enumerate(classes):
            if c == cc:
                val = 1.0
            else:
                val = similarities[c][cc]
            arr[i][j] = val

    plt.figure(figsize=(8, 6))
    plt.imshow(arr, cmap="inferno", vmin=0, vmax=1)
    plt.colorbar()
    plt.xticks(np.arange(classlen), classes, rotation=45, ha="right")
    plt.yticks(np.arange(classlen), classes)
    plt.title("Similarity Heatmap")
    plt.savefig("similarity_heatmap.png")


# get cross-validated mean and sd metrics
def get_cross_valid_metrics(all_tpr, all_roc_auc, all_tpr_wt):
    wts = [count / sum(all_tpr_wt) for count in all_tpr_wt]

    all_weighted_tpr = [a * b for a, b in zip(all_tpr, wts)]
    mean_tpr = np.sum(all_weighted_tpr, axis=0)
    mean_tpr[-1] = 1.0
    all_weighted_roc_auc = [a * b for a, b in zip(all_roc_auc, wts)]
    roc_auc = np.sum(all_weighted_roc_auc)
    roc_auc_sd = math.sqrt(np.average((all_roc_auc - roc_auc) ** 2, weights=wts))
    return mean_tpr, roc_auc, roc_auc_sd

+
|
124 |
+
def validate(
|
125 |
+
data,
|
126 |
+
targets,
|
127 |
+
labels,
|
128 |
+
nsplits,
|
129 |
+
subsample_size,
|
130 |
+
training_args,
|
131 |
+
freeze_layers,
|
132 |
+
output_dir,
|
133 |
+
num_proc,
|
134 |
+
num_labels,
|
135 |
+
pre_model,
|
136 |
+
):
|
137 |
# initiate eval metrics to return
|
138 |
num_classes = len(set(labels))
|
139 |
mean_fpr = np.linspace(0, 1, 100)
|
140 |
+
|
141 |
# create 80/20 train/eval splits
|
142 |
+
targets_train, targets_eval, labels_train, labels_eval = train_test_split(
|
143 |
+
targets, labels, test_size=0.25, shuffle=True
|
144 |
+
)
|
145 |
label_dict_train = dict(zip(targets_train, labels_train))
|
146 |
label_dict_eval = dict(zip(targets_eval, labels_eval))
|
147 |
+
|
148 |
# function to filter by whether contains train or eval labels
|
149 |
def if_contains_train_label(example):
|
150 |
a = label_dict_train.keys()
|
151 |
+
b = example["input_ids"]
|
152 |
return not set(a).isdisjoint(b)
|
153 |
|
154 |
def if_contains_eval_label(example):
|
155 |
a = label_dict_eval.keys()
|
156 |
+
b = example["input_ids"]
|
157 |
return not set(a).isdisjoint(b)
|
158 |
+
|
159 |
# filter dataset for examples containing classes for this split
|
160 |
print(f"Filtering training data")
|
161 |
trainset = data.filter(if_contains_train_label, num_proc=num_proc)
|
162 |
+
print(
|
163 |
+
f"Filtered {round((1-len(trainset)/len(data))*100)}%; {len(trainset)} remain\n"
|
164 |
+
)
|
165 |
print(f"Filtering evalation data")
|
166 |
evalset = data.filter(if_contains_eval_label, num_proc=num_proc)
|
167 |
print(f"Filtered {round((1-len(evalset)/len(data))*100)}%; {len(evalset)} remain\n")
|
168 |
+
|
169 |
# minimize to smaller training sample
|
170 |
training_size = min(subsample_size, len(trainset))
|
171 |
trainset_min = trainset.select([i for i in range(training_size)])
|
172 |
eval_size = min(training_size, len(evalset))
|
173 |
+
half_training_size = round(eval_size / 2)
|
174 |
evalset_train_min = evalset.select([i for i in range(half_training_size)])
|
175 |
evalset_oos_min = evalset.select([i for i in range(half_training_size, eval_size)])
|
176 |
+
|
177 |
# label conversion functions
|
178 |
def generate_train_labels(example):
|
179 |
+
example["labels"] = [
|
180 |
+
label_dict_train.get(token_id, -100) for token_id in example["input_ids"]
|
181 |
+
]
|
182 |
return example
|
183 |
|
184 |
def generate_eval_labels(example):
|
185 |
+
example["labels"] = [
|
186 |
+
label_dict_eval.get(token_id, -100) for token_id in example["input_ids"]
|
187 |
+
]
|
188 |
return example
|
189 |
+
|
190 |
+
# label datasets
|
191 |
print(f"Labeling training data")
|
192 |
trainset_labeled = trainset_min.map(generate_train_labels)
|
193 |
print(f"Labeling evaluation data")
|
194 |
evalset_train_labeled = evalset_train_min.map(generate_eval_labels)
|
195 |
print(f"Labeling evaluation OOS data")
|
196 |
evalset_oos_labeled = evalset_oos_min.map(generate_eval_labels)
|
197 |
+
|
198 |
# load model
|
199 |
model = BertForTokenClassification.from_pretrained(
|
200 |
+
pre_model,
|
201 |
+
num_labels=num_labels,
|
202 |
+
output_attentions=False,
|
203 |
+
output_hidden_states=False,
|
204 |
)
|
205 |
if freeze_layers is not None:
|
206 |
modules_to_freeze = model.bert.encoder.layer[:freeze_layers]
|
207 |
for module in modules_to_freeze:
|
208 |
for param in module.parameters():
|
209 |
param.requires_grad = False
|
210 |
+
|
211 |
model = model.to(device)
|
212 |
+
|
213 |
# add output directory to training args and initiate
|
214 |
training_args["output_dir"] = output_dir
|
215 |
training_args_init = TrainingArguments(**training_args)
|
216 |
+
|
217 |
# create the trainer
|
218 |
trainer = Trainer(
|
219 |
model=model,
|
|
|
222 |
train_dataset=trainset_labeled,
|
223 |
eval_dataset=evalset_train_labeled,
|
224 |
)
|
225 |
+
|
226 |
# train the gene classifier
|
227 |
trainer.train()
|
228 |
trainer.save_model(output_dir)
|
229 |
+
|
230 |
+
fpr, tpr, interp_tpr, conf_mat = classifier_predict(
|
231 |
+
trainer.model, evalset_oos_labeled, 200, mean_fpr
|
232 |
+
)
|
233 |
auc_score = auc(fpr, tpr)
|
234 |
+
|
235 |
return fpr, tpr, auc_score
|
236 |
+
|
237 |
+
|
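The -100 sentinel assigned by generate_train_labels/generate_eval_labels above is the index that PyTorch's cross-entropy loss skips, which is how non-target tokens are excluded from the token-classification objective. A minimal illustration:

import torch
import torch.nn.functional as F

logits = torch.randn(5, 2)                        # 5 tokens, 2 gene classes
labels = torch.tensor([0, -100, 1, -100, -100])   # only tokens 0 and 2 are targets

loss = F.cross_entropy(logits, labels, ignore_index=-100)
# identical to computing the loss over the two labeled tokens only
loss_manual = F.cross_entropy(logits[[0, 2]], labels[[0, 2]])
print(torch.isclose(loss, loss_manual))  # tensor(True)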
# cross-validate gene classifier
def cross_validate(
    data,
    targets,
    labels,
    nsplits,
    subsample_size,
    training_args,
    freeze_layers,
    output_dir,
    num_proc,
    num_labels,
    pre_model,
):
    # check if output directory already written to
    # ensure not overwriting previously saved model
    model_dir_test = os.path.join(output_dir, "ksplit0/models/pytorch_model.bin")
    # if os.path.isfile(model_dir_test) == True:
    #     raise Exception("Model already saved to this directory.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # initiate eval metrics to return
    num_classes = len(set(labels))
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    all_roc_auc = []
    all_tpr_wt = []
    label_dicts = []
    confusion = np.zeros((num_classes, num_classes))

    # set up cross-validation splits
    skf = StratifiedKFold(n_splits=nsplits, random_state=0, shuffle=True)
    # train and evaluate
    iteration_num = 0
    for train_index, eval_index in tqdm(skf.split(targets, labels)):
        if len(labels) > 500:
            print("early stopping activated due to large # of training examples")
            if iteration_num == 3:
                break

        print(f"****** Crossval split: {iteration_num}/{nsplits-1} ******\n")

        # generate cross-validation splits
        targets_train, targets_eval = targets[train_index], targets[eval_index]
        labels_train, labels_eval = labels[train_index], labels[eval_index]
        label_dict_train = dict(zip(targets_train, labels_train))
        label_dict_eval = dict(zip(targets_eval, labels_eval))
        # record this split's train/eval assignments (flattened into the list)
        label_dicts += (
            iteration_num,
            targets_train,
            targets_eval,
            labels_train,
            labels_eval,
        )

        # functions to filter by whether an example contains train or eval labels
        def if_contains_train_label(example):
            a = label_dict_train.keys()
            b = example["input_ids"]

            return not set(a).isdisjoint(b)

        def if_contains_eval_label(example):
            a = label_dict_eval.keys()
            b = example["input_ids"]

            return not set(a).isdisjoint(b)

        # filter dataset for examples containing classes for this split
        print("Filtering training data")
        trainset = data.filter(if_contains_train_label, num_proc=num_proc)
        print(
            f"Filtered {round((1-len(trainset)/len(data))*100)}%; {len(trainset)} remain\n"
        )
        print("Filtering evaluation data")
        evalset = data.filter(if_contains_eval_label, num_proc=num_proc)
        print(
            f"Filtered {round((1-len(evalset)/len(data))*100)}%; {len(evalset)} remain\n"
        )

        # minimize to smaller training sample
        training_size = min(subsample_size, len(trainset))
        trainset_min = trainset.select([i for i in range(training_size)])
        eval_size = min(training_size, len(evalset))
        half_training_size = round(eval_size / 2)
        evalset_train_min = evalset.select([i for i in range(half_training_size)])
        evalset_oos_min = evalset.select(
            [i for i in range(half_training_size, eval_size)]
        )

        # label conversion functions
        def generate_train_labels(example):
            example["labels"] = [
                label_dict_train.get(token_id, -100)
                for token_id in example["input_ids"]
            ]
            return example

        def generate_eval_labels(example):
            example["labels"] = [
                label_dict_eval.get(token_id, -100) for token_id in example["input_ids"]
            ]
            return example

        # label datasets
        print("Labeling training data")
        trainset_labeled = trainset_min.map(generate_train_labels)
        print("Labeling evaluation data")
        evalset_train_labeled = evalset_train_min.map(generate_eval_labels)
        print("Labeling evaluation OOS data")
        evalset_oos_labeled = evalset_oos_min.map(generate_eval_labels)

        # create output directories
        ksplit_output_dir = os.path.join(output_dir, f"ksplit{iteration_num}")
        ksplit_model_dir = os.path.join(ksplit_output_dir, "models/")

        # ensure not overwriting previously saved model
        model_output_file = os.path.join(ksplit_model_dir, "pytorch_model.bin")
        # if os.path.isfile(model_output_file) == True:
        #     raise Exception("Model already saved to this directory.")

        # make training and model output directories
        subprocess.call(f"mkdir -p {ksplit_output_dir}", shell=True)
        subprocess.call(f"mkdir -p {ksplit_model_dir}", shell=True)

        # load model
        model = BertForTokenClassification.from_pretrained(
            pre_model,
            num_labels=num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )
        if freeze_layers is not None:
            modules_to_freeze = model.bert.encoder.layer[:freeze_layers]
            for module in modules_to_freeze:
                for param in module.parameters():
                    param.requires_grad = False

        model = model.to(device)

        # add output directory to training args and initiate
        training_args["output_dir"] = ksplit_output_dir
        training_args_init = TrainingArguments(**training_args)

        # create the trainer
        trainer = Trainer(
            model=model,
            args=training_args_init,
            data_collator=DataCollatorForGeneClassification(),
            train_dataset=trainset_labeled,
            eval_dataset=evalset_train_labeled,
        )

        # train the gene classifier
        trainer.train()

        # save model
        trainer.save_model(ksplit_model_dir)

        # evaluate model
        fpr, tpr, interp_tpr, conf_mat = classifier_predict(
            trainer.model, evalset_oos_labeled, 200, mean_fpr
        )

        # append to tpr and roc lists
        confusion = confusion + conf_mat
        all_tpr.append(interp_tpr)
        all_roc_auc.append(auc(fpr, tpr))
        # append number of eval examples by which to weight tpr in averaged graphs
        all_tpr_wt.append(len(tpr))

        iteration_num = iteration_num + 1

    # get overall metrics for cross-validation
    mean_tpr, roc_auc, roc_auc_sd = get_cross_valid_metrics(
        all_tpr, all_roc_auc, all_tpr_wt
    )
    return all_roc_auc, roc_auc, roc_auc_sd, mean_fpr, mean_tpr, confusion, label_dicts

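cross_validate relies on StratifiedKFold, so every fold preserves the class balance of the gene labels. A small sketch of the same call pattern on toy data:

import numpy as np
from sklearn.model_selection import StratifiedKFold

targets = np.arange(12)                                    # token ids standing in for genes
labels = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1])    # 2:1 class imbalance

skf = StratifiedKFold(n_splits=4, random_state=0, shuffle=True)
for train_index, eval_index in skf.split(targets, labels):
    # each eval fold holds 2 genes of class 0 and 1 of class 1
    print(labels[eval_index])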
# Computes metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # calculate accuracy and macro f1 using sklearn's functions
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average="macro")

    return {"accuracy": acc, "macro_f1": macro_f1}

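compute_metrics receives an object shaped like Hugging Face's EvalPrediction, with logits in .predictions and true ids in .label_ids. A self-contained check with a stand-in namespace object:

import numpy as np
from types import SimpleNamespace
from sklearn.metrics import accuracy_score, f1_score

pred = SimpleNamespace(
    predictions=np.array([[2.0, 0.1], [0.2, 1.5], [3.0, 0.0]]),  # made-up logits
    label_ids=np.array([0, 1, 1]),
)
preds = pred.predictions.argmax(-1)           # -> [0, 1, 0]
print(accuracy_score(pred.label_ids, preds))  # 0.666...
print(f1_score(pred.label_ids, preds, average="macro"))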
# plot ROC curve
def plot_ROC(bundled_data, title):
    plt.figure()
    lw = 2
    for roc_auc, roc_auc_sd, mean_fpr, mean_tpr, sample, color in bundled_data:
        plt.plot(
            mean_fpr,
            mean_tpr,
            color=color,
            lw=lw,
            label=r"{0} (AUC {1:0.2f} $\pm$ {2:0.2f})".format(
                sample, roc_auc, roc_auc_sd
            ),
        )

    plt.plot([0, 1], [0, 1], color="black", lw=lw, linestyle="--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend(loc="lower right")
    plt.savefig("ROC.png")

    return mean_fpr, mean_tpr, roc_auc


# plot confusion matrix
def plot_confusion_matrix(classes_list, conf_mat, title):
    display_labels = []
    i = 0
    for label in classes_list:
        display_labels += ["{0}\nn={1:.0f}".format(label, sum(conf_mat[:, i]))]
        i = i + 1
    display = ConfusionMatrixDisplay(
        confusion_matrix=preprocessing.normalize(conf_mat, norm="l1"),
        display_labels=display_labels,
    )
    display.plot(cmap="Blues", values_format=".2g")
    plt.title(title)
    plt.savefig("CM.png")


# Find the largest number smaller than or equal to N that is divisible by K
def find_largest_div(N, K):
    rem = N % K
    if rem == 0:
        return N
    else:
        return N - rem

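Usage of the helper above: trimming an evaluation set of 1001 examples at a forward batch size of 200 drops the lone trailing example that would otherwise form a size-1 batch.

assert find_largest_div(1001, 200) == 1000  # 1001 % 200 == 1, so trim to 1000
assert find_largest_div(1000, 200) == 1000  # already divisible, unchanged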
def preprocess_classifier_batch(cell_batch, max_len):
    if max_len == None:
        max_len = max([len(i) for i in cell_batch["input_ids"]])

    def pad_label_example(example):
        example["labels"] = np.pad(
            example["labels"],
            (0, max_len - len(example["input_ids"])),
            mode="constant",
            constant_values=-100,
        )
        example["input_ids"] = np.pad(
            example["input_ids"],
            (0, max_len - len(example["input_ids"])),
            mode="constant",
            constant_values=token_dictionary.get("<pad>"),
        )
        example["attention_mask"] = (
            example["input_ids"] != token_dictionary.get("<pad>")
        ).astype(int)
        return example

    padded_batch = cell_batch.map(pad_label_example)
    return padded_batch


# forward batch size is batch size for model inference (e.g. 200)
def classifier_predict(model, evalset, forward_batch_size, mean_fpr):
    predict_logits = []
    predict_labels = []
    model.to("cpu")
    model.eval()

    # ensure there are at least 2 examples in each batch to avoid incorrect tensor dims
    evalset_len = len(evalset)
    max_divisible = find_largest_div(evalset_len, forward_batch_size)
    if len(evalset) - max_divisible == 1:
        evalset_len = max_divisible

    max_evalset_len = max(evalset.select([i for i in range(evalset_len)])["length"])

    for i in range(0, evalset_len, forward_batch_size):
        max_range = min(i + forward_batch_size, evalset_len)
        batch_evalset = evalset.select([i for i in range(i, max_range)])
        padded_batch = preprocess_classifier_batch(batch_evalset, max_evalset_len)
        padded_batch.set_format(type="torch")

        input_data_batch = padded_batch["input_ids"]
        attn_msk_batch = padded_batch["attention_mask"]
        label_batch = padded_batch["labels"]
        with torch.no_grad():
            input_ids = input_data_batch
            attn_mask = attn_msk_batch
            labels = label_batch
            outputs = model(
                input_ids=input_ids, attention_mask=attn_mask, labels=labels
            )
        predict_logits += [torch.squeeze(outputs.logits.to("cpu"))]
        predict_labels += [torch.squeeze(label_batch.to("cpu"))]

    logits_by_cell = torch.cat(predict_logits)
    all_logits = logits_by_cell.reshape(-1, logits_by_cell.shape[2])
    labels_by_cell = torch.cat(predict_labels)
    all_labels = torch.flatten(labels_by_cell)
    # keep only positions with real labels (padding and non-targets carry -100)
    logit_label_paired = [
        item
        for item in list(zip(all_logits.tolist(), all_labels.tolist()))
        if item[1] != -100
    ]
    y_pred = [vote(item[0]) for item in logit_label_paired]
    y_true = [item[1] for item in logit_label_paired]
    logits_list = [item[0] for item in logit_label_paired]
    # probability of class 1 for the ROC curve
    y_score = [py_softmax(item)[1] for item in logits_list]
    conf_mat = confusion_matrix(y_true, y_pred)
    fpr, tpr, _ = roc_curve(y_true, y_score)
    plt.plot(fpr, tpr)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC")
    plt.show()
    # interpolate to graph
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0
    return fpr, tpr, interp_tpr, conf_mat

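After the forward passes, classifier_predict reduces each kept token to a hard prediction via vote and to a class-1 score via a softmax. A self-contained sketch of that post-processing on made-up logit pairs (the helper definitions mirror the ones above):

import numpy as np

def vote(logit_pair):
    a, b = logit_pair
    if a > b:
        return 0
    elif b > a:
        return 1
    elif a == b:
        return "tie"

def py_softmax(vector):
    e = np.exp(vector)
    return e / e.sum()

logits = [[2.0, -1.0], [0.3, 1.1]]
print([vote(p) for p in logits])                     # [0, 1]
print([round(py_softmax(p)[1], 3) for p in logits])  # [0.047, 0.69]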
572 |
+
def classify_genes(
|
573 |
+
gene_info="Genecorpus-30M/example_input_files/gene_info_table.csv",
|
574 |
+
genes="Genecorpus-30M/example_input_files/gene_classification/dosage_sensitive_tfs/dosage_sens_tf_labels.csv",
|
575 |
+
corpus_30M="Genecorpus-30M/genecorpus_30M_2048.dataset/",
|
576 |
+
model=".",
|
577 |
+
max_input_size=2**11,
|
578 |
+
max_lr=5e-5,
|
579 |
+
freeze_layers=4,
|
580 |
+
num_gpus=1,
|
581 |
+
num_proc=os.cpu_count(),
|
582 |
+
geneformer_batch_size=9,
|
583 |
+
epochs=1,
|
584 |
+
filter_dataset=50_000,
|
585 |
+
emb_extract=True,
|
586 |
+
emb_layer=0,
|
587 |
+
forward_batch=200,
|
588 |
+
filter_data=None,
|
589 |
+
inference=False,
|
590 |
+
k_validate=True,
|
591 |
+
model_location="230917_geneformer_GeneClassifier_dosageTF_L2048_B12_LR5e-05_LSlinear_WU500_E1_Oadamw_n10000_F4/",
|
592 |
+
skip_training=False,
|
593 |
+
emb_dir="gene_emb",
|
594 |
+
output_dir=None,
|
595 |
+
max_cells=1000,
|
596 |
+
num_cpus=os.cpu_count(),
|
597 |
+
):
|
598 |
+
""" "
|
599 |
Primary Parameters
|
600 |
-----------
|
601 |
+
|
602 |
gene_info: path
|
603 |
Path to gene mappings
|
604 |
+
|
605 |
corpus_30M: path
|
606 |
Path to 30M Gene Corpus
|
607 |
+
|
608 |
model: path
|
609 |
Path to pretrained GeneFormer model
|
610 |
+
|
611 |
genes: path
|
612 |
Path to csv file containing different columns of genes and the column labels
|
613 |
+
|
614 |
inference: bool
|
615 |
Whether the model should be used to run inference. If False, model will train with labeled data instead. Defaults to False
|
616 |
+
|
617 |
k_validate: bool
|
618 |
Whether the model should run k-fold validation or simply perform regular training/evaluate. Defaults to True
|
619 |
+
|
620 |
skip_training: bool
|
621 |
Whether the model should skip the training portion. Defaults to False
|
622 |
+
|
623 |
emb_extract: bool
|
624 |
WHether the model should extract embeddings for a given gene (WIP)
|
625 |
+
|
626 |
+
|
627 |
Customization Parameters
|
628 |
-----------
|
629 |
+
|
630 |
freeze_layers: int
|
631 |
Freezes x number of layers from the model. Default is 4 (2 non-frozen layers)
|
632 |
+
|
633 |
filter_dataset: int
|
634 |
Number of cells to filter from 30M dataset. Default is 50_000
|
635 |
+
|
636 |
emb_layer: int
|
637 |
What layer embeddings are extracted from. Default is 4
|
638 |
+
|
639 |
filter_data: str, list
|
640 |
Filters down embeddings to a single category. Default is None
|
641 |
+
|
642 |
+
|
643 |
"""
|
644 |
+
|
645 |
# table of corresponding Ensembl IDs, gene names, and gene types (e.g. coding, miRNA, etc.)
|
646 |
gene_info = pd.read_csv(gene_info, index_col=0)
|
647 |
labels = gene_info.columns
|
648 |
|
649 |
# create dictionaries for corresponding attributes
|
650 |
+
gene_id_type_dict = dict(zip(gene_info["ensembl_id"], gene_info["gene_type"]))
|
651 |
+
gene_name_id_dict = dict(zip(gene_info["gene_name"], gene_info["ensembl_id"]))
|
652 |
+
gene_id_name_dict = {v: k for k, v in gene_name_id_dict.items()}
|
653 |
|
654 |
# function for preparing targets and labels
|
655 |
def prep_inputs(label_store, id_type):
|
656 |
target_list = []
|
657 |
if id_type == "gene_name":
|
658 |
for key in list(label_store.keys()):
|
659 |
+
targets = [
|
660 |
+
gene_name_id_dict[gene]
|
661 |
+
for gene in label_store[key]
|
662 |
+
if gene_name_id_dict.get(gene) in token_dictionary
|
663 |
+
]
|
664 |
targets_id = [token_dictionary[gene] for gene in targets]
|
665 |
target_list.append(targets_id)
|
666 |
elif id_type == "ensembl_id":
|
667 |
for key in list(label_store.keys()):
|
668 |
+
targets = [
|
669 |
+
gene for gene in label_store[key] if gene in token_dictionary
|
670 |
+
]
|
671 |
targets_id = [token_dictionary[gene] for gene in targets]
|
672 |
target_list.append(targets_id)
|
673 |
+
|
674 |
targets, labels = [], []
|
675 |
for targ in target_list:
|
676 |
targets = targets + targ
|
677 |
targets = np.array(targets)
|
678 |
for num, targ in enumerate(target_list):
|
679 |
+
label = [num] * len(targ)
|
680 |
labels = labels + label
|
681 |
labels = np.array(labels)
|
682 |
unique_labels = num + 1
|
683 |
+
|
684 |
+
nsplits = min(5, min([len(targ) for targ in target_list]) - 1)
|
685 |
assert nsplits > 2
|
686 |
+
|
687 |
return targets, labels, nsplits, unique_labels
|
688 |
+
|
689 |
if skip_training == False:
|
690 |
# preparing targets and labels for dosage sensitive vs insensitive TFs
|
691 |
gene_classes = pd.read_csv(genes, header=0)
|
|
|
697 |
else:
|
698 |
labels = [filter_data]
|
699 |
label_store = {}
|
700 |
+
|
701 |
# Dictionary for decoding labels
|
702 |
+
decode = {i: labels[i] for i in range(len(labels))}
|
703 |
+
|
704 |
for label in labels:
|
705 |
label_store[label] = gene_classes[label].dropna()
|
706 |
+
|
707 |
targets, labels, nsplits, unique_labels = prep_inputs(label_store, "ensembl_id")
|
708 |
+
|
|
|
|
|
709 |
# load training dataset
|
710 |
+
train_dataset = load_from_disk(corpus_30M)
|
711 |
shuffled_train_dataset = train_dataset.shuffle(seed=42)
|
712 |
+
subsampled_train_dataset = shuffled_train_dataset.select(
|
713 |
+
[i for i in range(filter_dataset)]
|
714 |
+
)
|
715 |
lr_schedule_fn = "linear"
|
716 |
warmup_steps = 500
|
717 |
optimizer = "adamw"
|
718 |
subsample_size = 10_000
|
719 |
+
|
720 |
training_args = {
|
721 |
"learning_rate": max_lr,
|
722 |
"do_train": True,
|
|
|
733 |
"per_device_eval_batch_size": geneformer_batch_size,
|
734 |
"num_train_epochs": epochs,
|
735 |
}
|
736 |
+
|
737 |
# define output directory path
|
738 |
current_date = datetime.datetime.now()
|
739 |
datestamp = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}"
|
740 |
+
|
741 |
if output_dir == None:
|
742 |
+
training_output_dir = Path(
|
743 |
+
f"{datestamp}_geneformer_GeneClassifier_dosageTF_L{max_input_size}_B{geneformer_batch_size}_LR{max_lr}_LS{lr_schedule_fn}_WU{warmup_steps}_E{epochs}_O{optimizer}_n{subsample_size}_F{freeze_layers}/"
|
744 |
+
)
|
745 |
else:
|
746 |
training_output_dir = Path(output_dir)
|
747 |
+
|
748 |
# make output directory
|
749 |
+
subprocess.call(f"mkdir -p {training_output_dir}", shell=True)
|
750 |
+
|
751 |
# Places number of classes + in directory
|
752 |
num_classes = len(set(labels))
|
753 |
info_list = [num_classes, decode]
|
754 |
+
|
755 |
+
with open(training_output_dir / "classes.txt", "w") as f:
|
756 |
f.write(str(info_list))
|
757 |
+
|
758 |
+
subsampled_train_dataset.save_to_disk(output_dir / "dataset")
|
759 |
+
|
760 |
if k_validate == True:
|
761 |
+
ksplit_model = "ksplit0/models"
|
762 |
ksplit_model_test = os.path.join(training_output_dir, ksplit_model)
|
763 |
+
# if os.path.isfile(ksplit_model_test) == True:
|
764 |
# raise Exception("Model already saved to this directory.")
|
765 |
+
# cross-validate gene classifier
|
766 |
+
(
|
767 |
+
all_roc_auc,
|
768 |
+
roc_auc,
|
769 |
+
roc_auc_sd,
|
770 |
+
mean_fpr,
|
771 |
+
mean_tpr,
|
772 |
+
confusion,
|
773 |
+
label_dicts,
|
774 |
+
) = cross_validate(
|
775 |
+
subsampled_train_dataset,
|
776 |
+
targets,
|
777 |
+
labels,
|
778 |
+
nsplits,
|
779 |
+
subsample_size,
|
780 |
+
training_args,
|
781 |
+
freeze_layers,
|
782 |
+
training_output_dir,
|
783 |
+
1,
|
784 |
+
unique_labels,
|
785 |
+
model,
|
786 |
+
)
|
787 |
+
|
788 |
bundled_data = []
|
789 |
+
bundled_data += [
|
790 |
+
(roc_auc, roc_auc_sd, mean_fpr, mean_tpr, "Geneformer", "red")
|
791 |
+
]
|
792 |
+
graph_title = " ".join(
|
793 |
+
[
|
794 |
+
i + " vs" if count < len(label_store) - 1 else i
|
795 |
+
for count, i in enumerate(label_store)
|
796 |
+
]
|
797 |
+
)
|
798 |
+
fpr, tpr, auc = plot_ROC(
|
799 |
+
bundled_data, "Dosage Sensitive vs Insensitive TFs"
|
800 |
+
)
|
801 |
print(auc)
|
802 |
# plot confusion matrix
|
803 |
plot_confusion_matrix(label_store, confusion, "Geneformer")
|
    else:
        fpr, tpr, auc = validate(
            subsampled_train_dataset,
            targets,
            labels,
            nsplits,
            subsample_size,
            training_args,
            freeze_layers,
            training_output_dir,
            1,
            unique_labels,
            model,
        )
        print(auc)

    if inference:
        # preparing targets and labels for dosage sensitive vs insensitive TFs
        gene_classes = pd.read_csv(genes, header=0)
        targets = []
        for column in gene_classes.columns:
            targets += list(gene_classes[column])
        tokens = []
        for target in targets:
            try:
                tokens.append(token_dictionary[target])
            except KeyError:
                tokens.append(0)

        targets = torch.LongTensor([tokens])
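        # targets is a single "batch" of shape (1, n_genes); genes missing from the
        # token dictionary fall back to token id 0 (assumed to be the pad token).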

        with open(f"{model_location}/classes.txt", "r") as f:
            info_list = ast.literal_eval(f.read())
        num_classes = info_list[0]
        labels = info_list[1]

        model = BertForTokenClassification.from_pretrained(
            model_location,
            num_labels=num_classes,
            output_attentions=False,
            output_hidden_states=False,
            local_files_only=True,
        )
        if freeze_layers is not None:
            modules_to_freeze = model.bert.encoder.layer[:freeze_layers]
            for module in modules_to_freeze:
                for param in module.parameters():
                    param.requires_grad = False

        model = model.to(device)

        # evaluate model
        predictions = F.softmax(model(targets.to(device))["logits"], dim=-1).argmax(-1)[
            0
        ]
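        # BertForTokenClassification logits have shape (batch, seq_len, num_labels);
        # softmax + argmax yields one predicted class per input gene token, and the
        # trailing [0] drops the singleton batch dimension.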
        predictions = [labels[int(pred)] for pred in predictions]

        return predictions

    # Extracts aggregate gene embeddings for each label
    if emb_extract:
        with open(f"{model_location}/classes.txt", "r") as f:
            data = ast.literal_eval(f.read())
        num_classes = data[0]
        decode = data[1]

        gene_classes = pd.read_csv(genes, header=0)
        labels = gene_classes.columns
        tokenize = TranscriptomeTokenizer()

        label_dict = {}
        for label in labels:
            genes = gene_classes[label]

            except:
                continue
            label_dict[label] = tokenized_genes

        embex = EmbExtractor(
            model_type="GeneClassifier",
            num_classes=num_classes,
            emb_mode="gene",
            filter_data=None,
            max_ncells=max_cells,
            emb_layer=emb_layer,
            emb_label=label_dict,
            labels_to_plot=list(labels),
            forward_batch_size=forward_batch,
            nproc=num_cpus,
        )
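        # EmbExtractor comes from the geneformer package; with emb_mode="gene" and
        # emb_label=label_dict it is assumed to return one aggregate embedding
        # column per label, which is how embs is indexed below.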

        subprocess.call(f"mkdir -p {emb_dir}", shell=True)

        embs = embex.extract_embs(
            model_directory=model_location,
            input_data_file=model_location / "dataset",
            output_directory=emb_dir,
            output_prefix=f"{label}_embeddings",
        )

        emb_dict = {label: [] for label in list(set(labels))}
        similarities = {key: {} for key in list(emb_dict.keys())}

        for column in embs.columns:
            remaining_cols = [k for k in embs.columns if k != column]
            for k in remaining_cols:
                embedding = torch.Tensor(embs[k])
                sim = similarity(torch.Tensor(embs[column]), embedding, cosine=True)
                similarities[column][k] = sim

        plot_similarity_heatmap(similarities)
        print(similarities)

        return similarities
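
# `similarity` above is assumed to collapse two embedding tensors into one score;
# a minimal cosine version (a sketch, not necessarily this module's helper):
#
#     def similarity(a, b, cosine=True):
#         if cosine:
#             return F.cosine_similarity(a.flatten(), b.flatten(), dim=0).item()
#         return -torch.dist(a, b).item()  # negated euclidean distance otherwise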


if __name__ == "__main__":
    classify_genes(
        k_validate=False,
        inference=False,
        skip_training=False,
        emb_extract=True,
        output_dir=Path("gene_emb"),
        model_location=Path("gene_emb"),
        epochs=5,
        gene_info="../GeneFormer_repo/Genecorpus-30M/example_input_files/gene_info_table.csv",
        genes="../GeneFormer_repo/Genecorpus-30M/example_input_files/gene_classification/dosage_sensitive_tfs/dosage_sens_tf_labels.csv",
        corpus_30M="../GeneFormer_repo/Genecorpus-30M/genecorpus_30M_2048.dataset/",
    )
Modular_usage.md → geneformer/modular_classifier_usage.md
RENAMED
@@ -1,33 +1,33 @@
# Cell classifier
def finetune_cells(token_set = Path('geneformer/token_dictionary.pkl'), median_set = Path('geneformer/gene_median_dictionary.pkl'), pretrained_model = ".",
                   dataset = 'Genecorpus-30M/example_input_files/cell_classification/cell_type_annotation/cell_type_train_data.dataset/',
                   dataset_split = None,
                   filter_cells = .005,
                   epochs = 1,
                   cpu_cores = os.cpu_count(),
                   geneformer_batch_size = 12,
                   optimizer = 'adamw',
                   max_lr = 5e-5,
                   num_gpus = torch.cuda.device_count(),
                   max_input_size = 2 ** 11,
                   lr_schedule_fn = "linear",
                   warmup_steps = 500,
                   freeze_layers = 0,
                   emb_extract = False,
                   max_cells = 1000,
                   emb_layer = 0,
                   emb_filter = None,
                   emb_dir = 'embeddings',
                   overwrite = True,
                   label = "cell_type",
                   data_filter = None,
                   forward_batch = 200, model_location = None,
                   skip_training = False,
                   sample_data = 1,
                   inference = False,
                   optimize_hyperparameters = False,
                   output_dir = None):

    '''
    Primary Parameters
    -------------------
@@ -36,121 +36,121 @@ def finetune_cells(token_set = Path('geneformer/token_dictionary.pkl'), median_s

    model_location: path
        Path to the location of an existing model to use for inference and embedding extraction

    pretrained_model: path
        Path to the pretrained GeneFormer 30M model before fine-tuning

    inference: bool
        Chooses whether to perform inference (which causes the function to return the list of similarities). Defaults to False

    skip_training: bool
        Chooses whether to skip training the model. Defaults to False

    emb_extract: bool
        Chooses whether to extract embeddings and calculate similarities. Defaults to True

    optimize_hyperparameters: bool
        Chooses whether to optimize model hyperparameters. Defaults to False

    label: string
        The label string in the formatted dataset that contains the true class labels. Defaults to "label"

    Customization Parameters
    -------------------

    dataset_split: str
        How the dataset should be partitioned (if at all), and what ID should be used for partitioning

    data_filter: list
        (For embeddings and inference) Runs the analysis on subsets of the dataset, selected by the ID defined by dataset_split

    label: str
        What feature should be read as a classification label

    emb_layer: int
        What layer embeddings should be extracted and compared from.

    emb_filter: ['cell1', 'cell2'...]
        Allows the user to narrow down the range of cells that embeddings will be extracted from.

    max_cells: int
        How many cell embeddings should be extracted.

    freeze_layers: int
        Number of layers that should be permanently frozen during fine-tuning (starting from the first layer; 4 brings it up to the pretrained model).

    sample_data: float
        What proportion of the HF dataset should be used

    '''
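
A minimal training call (a sketch: the dataset path below is the repository's example dataset, and all other arguments keep their defaults):

    finetune_cells(
        dataset = 'Genecorpus-30M/example_input_files/cell_classification/cell_type_annotation/cell_type_train_data.dataset/',
        label = "cell_type",
        epochs = 1,
        output_dir = 'cell_ft',
    )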

# Gene Classifier
def classify_genes(gene_info = "Genecorpus-30M/example_input_files/gene_info_table.csv",
                   genes = "Genecorpus-30M/example_input_files/gene_classification/dosage_sensitive_tfs/dosage_sens_tf_labels.csv",
                   corpus_30M = "Genecorpus-30M/genecorpus_30M_2048.dataset/", model = '.',
                   max_input_size = 2 ** 11,
                   max_lr = 5e-5,
                   freeze_layers = 4,
                   num_gpus = 1,
                   num_proc = os.cpu_count(),
                   geneformer_batch_size = 9,
                   epochs = 1,
                   filter_dataset = 50_000,
                   emb_extract = True,
                   emb_layer = 0,
                   forward_batch = 200,
                   filter_data = None,
                   inference = False,
                   k_validate = True,
                   model_location = "230917_geneformer_GeneClassifier_dosageTF_L2048_B12_LR5e-05_LSlinear_WU500_E1_Oadamw_n10000_F4/",
                   skip_training = False,
                   emb_dir = 'gene_emb',
                   output_dir = None,
                   max_cells = 1000,
                   num_cpus = os.cpu_count()):

    """
    Primary Parameters
    -----------

    gene_info: path
        Path to gene mappings

    corpus_30M: path
        Path to the 30M gene corpus

    model: path
        Path to the pretrained GeneFormer model

    genes: path
        Path to a csv file containing different columns of genes and the column labels

    inference: bool
        Whether the model should be used to run inference. If False, the model will train with labeled data instead. Defaults to False

    k_validate: bool
        Whether the model should run k-fold validation or simply perform regular training/evaluation. Defaults to True

    skip_training: bool
        Whether the model should skip the training portion. Defaults to False

    emb_extract: bool
        Whether the model should extract embeddings for a given gene (WIP)

    Customization Parameters
    -----------

    freeze_layers: int
        Freezes x number of layers from the model. Default is 4 (2 non-frozen layers)

    filter_dataset: int
        Number of cells to filter from the 30M dataset. Default is 50_000

    emb_layer: int
        What layer embeddings are extracted from. Default is 0

    filter_data: str, list
        Filters down embeddings to a single category. Default is None

    """
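
Example call, mirroring the `__main__` block of geneformer/gene_classifier.py (the default Genecorpus-30M paths are assumed):

    classify_genes(
        k_validate = False,
        skip_training = False,
        emb_extract = True,
        output_dir = Path("gene_emb"),
        model_location = Path("gene_emb"),
        epochs = 5,
    )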