{ "cells": [ { "cell_type": "markdown", "id": "25107132", "metadata": {}, "source": [ "### Preparing train and test data splits for cell type annotation application" ] }, { "cell_type": "code", "execution_count": 3, "id": "83d8d249-affe-45dd-915e-992b4b35b31a", "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from tqdm.notebook import tqdm\n", "from collections import Counter\n", "import pickle" ] }, { "cell_type": "code", "execution_count": 4, "id": "e3e6a2bf-44c8-4164-9ecd-1686230ea8be", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['pancreas',\n", " 'liver',\n", " 'blood',\n", " 'lung',\n", " 'spleen',\n", " 'placenta',\n", " 'colorectum',\n", " 'kidney',\n", " 'brain']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rootdir = \"/path/to/data/\"\n", "\n", "# collect panel of tissues to test\n", "dir_list = []\n", "for dir_i in os.listdir(rootdir):\n", " if (\"results\" not in dir_i) & (os.path.isdir(os.path.join(rootdir, dir_i))):\n", " dir_list += [dir_i]\n", "dir_list" ] }, { "cell_type": "code", "execution_count": 5, "id": "0b205eec-a518-472a-ab90-dd63ef9803cd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
filter_passoriginal_cell_id
00C_1
11C_2
20C_3
31C_4
40C_5
.........
95901C_9591
95911C_9592
95921C_9593
95931C_9594
95941C_9595
\n", "

9595 rows × 2 columns

\n", "
" ], "text/plain": [ " filter_pass original_cell_id\n", "0 0 C_1\n", "1 1 C_2\n", "2 0 C_3\n", "3 1 C_4\n", "4 0 C_5\n", "... ... ...\n", "9590 1 C_9591\n", "9591 1 C_9592\n", "9592 1 C_9593\n", "9593 1 C_9594\n", "9594 1 C_9595\n", "\n", "[9595 rows x 2 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# dictionary of cell barcodes that passed QC filtering applied by Geneformer \n", "# to ensure same cells were used for comparison\n", "with open(f\"{rootdir}deepsort_filter_dict.pickle\", \"rb\") as fp:\n", " filter_dict = pickle.load(fp)\n", "\n", "# for example:\n", "filter_dict[\"human_Placenta9595_data\"]" ] }, { "cell_type": "code", "execution_count": null, "id": "207e3571-0236-4493-83b3-a89b67b16cb2", "metadata": { "tags": [] }, "outputs": [], "source": [ "for dir_name in tqdm(dir_list):\n", "\n", " df = pd.DataFrame()\n", " ct_df = pd.DataFrame(columns=[\"Cell\",\"Cell_type\"])\n", " \n", " subrootdir = f\"{rootdir}{dir_name}/\"\n", " for subdir, dirs, files in os.walk(subrootdir):\n", " for i in range(len(files)):\n", " file = files[i]\n", " if file.endswith(\"_data.csv\"):\n", " file_prefix = file.replace(\"_data.csv\",\"\")\n", " sample_prefix = file.replace(\".csv\",\"\")\n", " filter_df = filter_dict[sample_prefix]\n", " sample_to_analyze = list(filter_df[filter_df[\"filter_pass\"]==1][\"original_cell_id\"])\n", " \n", " # collect data for each tissue\n", " df_i = pd.read_csv(f\"{subrootdir}{file}\", index_col=0)\n", " df_i = df_i[sample_to_analyze]\n", " df_i.columns = [f\"{i}_{cell_id}\" for cell_id in df_i.columns]\n", " df = pd.concat([df,df_i],axis=1)\n", " \n", " # collect cell type metadata\n", " ct_df_i = pd.read_csv(f\"{subrootdir}{file_prefix}_celltype.csv\", index_col=0)\n", " ct_df_i.columns = [\"Cell\",\"Cell_type\"]\n", " ct_df_i[\"Cell\"] = [f\"{i}_{cell_id}\" for cell_id in ct_df_i[\"Cell\"]]\n", " ct_df = pd.concat([ct_df,ct_df_i],axis=0)\n", " \n", " # per published scDeepsort method, filter data for cell types >0.5% of data\n", " ct_counts = Counter(ct_df[\"Cell_type\"])\n", " total_count = sum(ct_counts.values())\n", " nonrare_cell_types = [cell_type for cell_type,count in ct_counts.items() if count>(total_count*0.005)]\n", " nonrare_cells = list(ct_df[ct_df[\"Cell_type\"].isin(nonrare_cell_types)][\"Cell\"])\n", " df = df[df.columns.intersection(nonrare_cells)]\n", "\n", " # split into 80/20 train/test data\n", " train, test = train_test_split(df.T, test_size=0.2)\n", " train = train.T\n", " test = test.T \n", " \n", " # save filtered train/test data\n", " train.to_csv(f\"{subrootdir}{dir_name}_filtered_data_train.csv\")\n", " test.to_csv(f\"{subrootdir}{dir_name}_filtered_data_test.csv\")\n", "\n", " # split metadata into train/test data\n", " ct_df_train = ct_df[ct_df[\"Cell\"].isin(list(train.columns))]\n", " ct_df_test = ct_df[ct_df[\"Cell\"].isin(list(test.columns))]\n", " train_order_dict = dict(zip(train.columns,[i for i in range(len(train.columns))]))\n", " test_order_dict = dict(zip(test.columns,[i for i in range(len(test.columns))]))\n", " ct_df_train[\"order\"] = [train_order_dict[cell_id] for cell_id in ct_df_train[\"Cell\"]]\n", " ct_df_test[\"order\"] = [test_order_dict[cell_id] for cell_id in ct_df_test[\"Cell\"]]\n", " ct_df_train = ct_df_train.sort_values(\"order\")\n", " ct_df_test = ct_df_test.sort_values(\"order\")\n", " ct_df_train = ct_df_train.drop(\"order\",axis=1)\n", " ct_df_test = ct_df_test.drop(\"order\",axis=1)\n", " assert list(ct_df_train[\"Cell\"]) == list(train.columns)\n", " assert list(ct_df_test[\"Cell\"]) == list(test.columns)\n", " train_labels = list(Counter(ct_df_train[\"Cell_type\"]).keys())\n", " test_labels = list(Counter(ct_df_test[\"Cell_type\"]).keys())\n", " assert set(train_labels) == set(test_labels)\n", " \n", " # save train/test cell type annotations\n", " ct_df_train.to_csv(f\"{subrootdir}{dir_name}_filtered_celltype_train.csv\")\n", " ct_df_test.to_csv(f\"{subrootdir}{dir_name}_filtered_celltype_test.csv\")\n", " " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.8.6 64-bit ('3.8.6')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.6" }, "vscode": { "interpreter": { "hash": "eba1599a1f7e611c14c87ccff6793920aa63510b01fc0e229d6dd014149b8829" } } }, "nbformat": 4, "nbformat_minor": 5 }