{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e10ac0c9-40ce-41fb-b6fa-3d62b76f2e57",
   "metadata": {},
   "outputs": [],
   "source": [
    "from geneformer import EmbExtractor, InSilicoPerturber, InSilicoPerturberStats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f7a9c21-5b8e-4d06-a2c4-9e1b6d0f8a57",
   "metadata": {},
   "outputs": [],
   "source": [
    "# shared configuration: replace the placeholder paths below with real locations\n",
    "# (defined once here so that every step of the workflow uses the same values)\n",
    "\n",
    "# example 30M fine-tuned model\n",
    "MODEL_DIRECTORY = \"../fine_tuned_models/gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224\"\n",
    "\n",
    "# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
    "# (otherwise EmbExtractor, InSilicoPerturber, and InSilicoPerturberStats will use the current default model dictionary)\n",
    "# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
    "TOKEN_DICTIONARY_FILE = \"path/to/token_dictionary_gc30M.pkl\"  # local copy of the file linked above\n",
    "\n",
    "INPUT_DATA_FILE = \"path/to/input_data\"\n",
    "STATE_EMBS_OUTPUT_DIRECTORY = \"path/to/output_directory\"\n",
    "ISP_OUTPUT_DIRECTORY = \"path/to/isp_output_directory\"\n",
    "ISP_STATS_OUTPUT_DIRECTORY = \"path/to/isp_stats_output_directory\"\n",
    "OUTPUT_PREFIX = \"output_prefix\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cbd6851c-060e-4967-b816-e605ffe58b23",
   "metadata": {
    "tags": []
   },
   "source": [
    "### In silico perturbation in deletion mode to determine genes whose deletion in the dilated cardiomyopathy (dcm) state significantly shifts the embedding towards the non-failing (nf) state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c53e98cd-c603-4878-82ba-db471181bb55",
   "metadata": {},
   "outputs": [],
   "source": [
    "# first obtain start, goal, and alt embedding positions\n",
    "# this function was changed to be separate from perturb_data\n",
    "# to avoid repeating calculations when parallelizing perturb_data\n",
    "cell_states_to_model = {\n",
    "    \"state_key\": \"disease\",\n",
    "    \"start_state\": \"dcm\",\n",
    "    \"goal_state\": \"nf\",\n",
    "    \"alt_states\": [\"hcm\"],\n",
    "}\n",
    "\n",
    "filter_data_dict = {\"cell_type\": [\"Cardiomyocyte1\", \"Cardiomyocyte2\", \"Cardiomyocyte3\"]}\n",
    "\n",
    "# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
    "# (otherwise the EmbExtractor will use the current default model dictionary)\n",
    "# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
    "embex = EmbExtractor(\n",
    "    model_type=\"CellClassifier\",  # if using previously fine-tuned cell classifier model\n",
    "    num_classes=3,\n",
    "    filter_data=filter_data_dict,\n",
    "    max_ncells=1000,\n",
    "    emb_layer=0,\n",
    "    summary_stat=\"exact_mean\",\n",
    "    forward_batch_size=256,\n",
    "    token_dictionary_file=TOKEN_DICTIONARY_FILE,  # required here because MODEL_DIRECTORY is a 30M series model\n",
    "    nproc=16,\n",
    ")\n",
    "\n",
    "state_embs_dict = embex.get_state_embs(\n",
    "    cell_states_to_model,\n",
    "    MODEL_DIRECTORY,\n",
    "    INPUT_DATA_FILE,\n",
    "    STATE_EMBS_OUTPUT_DIRECTORY,\n",
    "    OUTPUT_PREFIX,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "981e1190-62da-4543-b7d3-6e2a2d6a6d56",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
    "# (otherwise the InSilicoPerturber will use the current default model dictionary)\n",
    "# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
    "isp = InSilicoPerturber(\n",
    "    perturb_type=\"delete\",\n",
    "    perturb_rank_shift=None,\n",
    "    genes_to_perturb=\"all\",\n",
    "    combos=0,\n",
    "    anchor_gene=None,\n",
    "    model_type=\"CellClassifier\",  # if using previously fine-tuned cell classifier model\n",
    "    num_classes=3,\n",
    "    emb_mode=\"cell\",\n",
    "    cell_emb_style=\"mean_pool\",\n",
    "    filter_data=filter_data_dict,\n",
    "    cell_states_to_model=cell_states_to_model,\n",
    "    state_embs_dict=state_embs_dict,  # start/goal/alt state embeddings returned by get_state_embs\n",
    "    max_ncells=2000,\n",
    "    emb_layer=0,\n",
    "    forward_batch_size=400,\n",
    "    token_dictionary_file=TOKEN_DICTIONARY_FILE,  # required here because MODEL_DIRECTORY is a 30M series model\n",
    "    nproc=16,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0525a663-871a-4ce0-a135-cc203817ffa9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# outputs intermediate files from in silico perturbation\n",
    "# (written to ISP_OUTPUT_DIRECTORY, which is later passed to get_stats)\n",
    "isp.perturb_data(\n",
    "    MODEL_DIRECTORY,\n",
    "    INPUT_DATA_FILE,\n",
    "    ISP_OUTPUT_DIRECTORY,\n",
    "    OUTPUT_PREFIX,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8aadabb-516a-4dc0-b307-6de880e64e26",
   "metadata": {},
   "outputs": [],
   "source": [
    "# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
    "# (otherwise the InSilicoPerturberStats will use the current default model dictionary)\n",
    "# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
    "ispstats = InSilicoPerturberStats(\n",
    "    mode=\"goal_state_shift\",\n",
    "    genes_perturbed=\"all\",\n",
    "    combos=0,\n",
    "    anchor_gene=None,\n",
    "    cell_states_to_model=cell_states_to_model,\n",
    "    token_dictionary_file=TOKEN_DICTIONARY_FILE,  # required here because MODEL_DIRECTORY is a 30M series model\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffecfae6-e737-43e3-99e9-fa37ff46610b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# extracts data from intermediate files and processes stats to output in final .csv\n",
    "ispstats.get_stats(\n",
    "    ISP_OUTPUT_DIRECTORY,  # this should be the directory that perturb_data wrote its intermediate files to\n",
    "    None,  # NOTE(review): presumably the null distribution directory, not needed for goal_state_shift - confirm in get_stats docs\n",
    "    ISP_STATS_OUTPUT_DIRECTORY,\n",
    "    OUTPUT_PREFIX,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}