{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "25107132",
   "metadata": {},
   "source": [
    "### Preparing train and test data splits for cell type annotation application"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "83d8d249-affe-45dd-915e-992b4b35b31a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from tqdm.notebook import tqdm\n",
    "from collections import Counter\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e3e6a2bf-44c8-4164-9ecd-1686230ea8be",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['pancreas',\n",
       " 'liver',\n",
       " 'blood',\n",
       " 'lung',\n",
       " 'spleen',\n",
       " 'placenta',\n",
       " 'colorectum',\n",
       " 'kidney',\n",
       " 'brain']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rootdir = \"/path/to/data/\"\n",
    "\n",
    "# collect panel of tissues to test\n",
    "dir_list = []\n",
    "for dir_i in os.listdir(rootdir):\n",
    "    if (\"results\" not in dir_i) & (os.path.isdir(os.path.join(rootdir, dir_i))):\n",
    "        dir_list += [dir_i]\n",
    "dir_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0b205eec-a518-472a-ab90-dd63ef9803cd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filter_pass</th>\n",
       "      <th>original_cell_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>C_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>C_2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>C_3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>C_4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>C_5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9590</th>\n",
       "      <td>1</td>\n",
       "      <td>C_9591</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9591</th>\n",
       "      <td>1</td>\n",
       "      <td>C_9592</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9592</th>\n",
       "      <td>1</td>\n",
       "      <td>C_9593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9593</th>\n",
       "      <td>1</td>\n",
       "      <td>C_9594</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9594</th>\n",
       "      <td>1</td>\n",
       "      <td>C_9595</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>9595 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      filter_pass original_cell_id\n",
       "0               0              C_1\n",
       "1               1              C_2\n",
       "2               0              C_3\n",
       "3               1              C_4\n",
       "4               0              C_5\n",
       "...           ...              ...\n",
       "9590            1           C_9591\n",
       "9591            1           C_9592\n",
       "9592            1           C_9593\n",
       "9593            1           C_9594\n",
       "9594            1           C_9595\n",
       "\n",
       "[9595 rows x 2 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# dictionary of cell barcodes that passed QC filtering applied by Geneformer \n",
    "# to ensure same cells were used for comparison\n",
    "with open(f\"{rootdir}deepsort_filter_dict.pickle\", \"rb\") as fp:\n",
    "    filter_dict = pickle.load(fp)\n",
    "\n",
    "# for example:\n",
    "filter_dict[\"human_Placenta9595_data\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "207e3571-0236-4493-83b3-a89b67b16cb2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "for dir_name in tqdm(dir_list):\n",
    "\n",
    "    df = pd.DataFrame()\n",
    "    ct_df = pd.DataFrame(columns=[\"Cell\",\"Cell_type\"])\n",
    "    \n",
    "    subrootdir = f\"{rootdir}{dir_name}/\"\n",
    "    for subdir, dirs, files in os.walk(subrootdir):\n",
    "        for i in range(len(files)):\n",
    "            file = files[i]\n",
    "            if file.endswith(\"_data.csv\"):\n",
    "                file_prefix = file.replace(\"_data.csv\",\"\")\n",
    "                sample_prefix = file.replace(\".csv\",\"\")\n",
    "                filter_df = filter_dict[sample_prefix]\n",
    "                sample_to_analyze = list(filter_df[filter_df[\"filter_pass\"]==1][\"original_cell_id\"])\n",
    "                \n",
    "                # collect data for each tissue\n",
    "                df_i = pd.read_csv(f\"{subrootdir}{file}\", index_col=0)\n",
    "                df_i = df_i[sample_to_analyze]\n",
    "                df_i.columns = [f\"{i}_{cell_id}\" for cell_id in df_i.columns]\n",
    "                df = pd.concat([df,df_i],axis=1)\n",
    "                \n",
    "                # collect cell type metadata\n",
    "                ct_df_i = pd.read_csv(f\"{subrootdir}{file_prefix}_celltype.csv\", index_col=0)\n",
    "                ct_df_i.columns = [\"Cell\",\"Cell_type\"]\n",
    "                ct_df_i[\"Cell\"] = [f\"{i}_{cell_id}\" for cell_id in ct_df_i[\"Cell\"]]\n",
    "                ct_df = pd.concat([ct_df,ct_df_i],axis=0)\n",
    "        \n",
    "    # per published scDeepsort method, filter data for cell types >0.5% of data\n",
    "    ct_counts = Counter(ct_df[\"Cell_type\"])\n",
    "    total_count = sum(ct_counts.values())\n",
    "    nonrare_cell_types = [cell_type for cell_type,count in ct_counts.items() if count>(total_count*0.005)]\n",
    "    nonrare_cells = list(ct_df[ct_df[\"Cell_type\"].isin(nonrare_cell_types)][\"Cell\"])\n",
    "    df = df[df.columns.intersection(nonrare_cells)]\n",
    "\n",
    "    # split into 80/20 train/test data\n",
    "    train, test = train_test_split(df.T, test_size=0.2)\n",
    "    train = train.T\n",
    "    test = test.T  \n",
    "    \n",
    "    # save filtered train/test data\n",
    "    train.to_csv(f\"{subrootdir}{dir_name}_filtered_data_train.csv\")\n",
    "    test.to_csv(f\"{subrootdir}{dir_name}_filtered_data_test.csv\")\n",
    "\n",
    "    # split metadata into train/test data\n",
    "    ct_df_train = ct_df[ct_df[\"Cell\"].isin(list(train.columns))]\n",
    "    ct_df_test = ct_df[ct_df[\"Cell\"].isin(list(test.columns))]\n",
    "    train_order_dict = dict(zip(train.columns,[i for i in range(len(train.columns))]))\n",
    "    test_order_dict = dict(zip(test.columns,[i for i in range(len(test.columns))]))\n",
    "    ct_df_train[\"order\"] = [train_order_dict[cell_id] for cell_id in ct_df_train[\"Cell\"]]\n",
    "    ct_df_test[\"order\"] = [test_order_dict[cell_id] for cell_id in ct_df_test[\"Cell\"]]\n",
    "    ct_df_train = ct_df_train.sort_values(\"order\")\n",
    "    ct_df_test = ct_df_test.sort_values(\"order\")\n",
    "    ct_df_train = ct_df_train.drop(\"order\",axis=1)\n",
    "    ct_df_test = ct_df_test.drop(\"order\",axis=1)\n",
    "    assert list(ct_df_train[\"Cell\"]) == list(train.columns)\n",
    "    assert list(ct_df_test[\"Cell\"]) == list(test.columns)\n",
    "    train_labels = list(Counter(ct_df_train[\"Cell_type\"]).keys())\n",
    "    test_labels = list(Counter(ct_df_test[\"Cell_type\"]).keys())\n",
    "    assert set(train_labels) == set(test_labels)\n",
    "    \n",
    "    # save train/test cell type annotations\n",
    "    ct_df_train.to_csv(f\"{subrootdir}{dir_name}_filtered_celltype_train.csv\")\n",
    "    ct_df_test.to_csv(f\"{subrootdir}{dir_name}_filtered_celltype_test.csv\")\n",
    "                "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.6 64-bit ('3.8.6')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  },
  "vscode": {
   "interpreter": {
    "hash": "eba1599a1f7e611c14c87ccff6793920aa63510b01fc0e229d6dd014149b8829"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}