{ "cells": [ { "cell_type": "markdown", "id": "25107132", "metadata": {}, "source": [ "### Preparing train and test data splits for cell type annotation application" ] }, { "cell_type": "code", "execution_count": 3, "id": "83d8d249-affe-45dd-915e-992b4b35b31a", "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from tqdm.notebook import tqdm\n", "from collections import Counter\n", "import pickle" ] }, { "cell_type": "code", "execution_count": 4, "id": "e3e6a2bf-44c8-4164-9ecd-1686230ea8be", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['pancreas',\n", " 'liver',\n", " 'blood',\n", " 'lung',\n", " 'spleen',\n", " 'placenta',\n", " 'colorectum',\n", " 'kidney',\n", " 'brain']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rootdir = \"/path/to/data/\"\n", "\n", "# Collect the panel of tissues to test: each entry of rootdir that is a\n", "# directory and whose name does not contain \"results\".\n", "# Entries are kept in the order os.listdir returns them.\n", "dir_list = []\n", "for dir_i in os.listdir(rootdir):\n", "    # Logical `and` short-circuits, so isdir is only checked for names\n", "    # that already passed the \"results\" filter.\n", "    if \"results\" not in dir_i and os.path.isdir(os.path.join(rootdir, dir_i)):\n", "        dir_list.append(dir_i)\n", "dir_list" ] }, { "cell_type": "code", "execution_count": 5, "id": "0b205eec-a518-472a-ab90-dd63ef9803cd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | filter_pass | \n", "original_cell_id | \n", "
---|---|---|
0 | \n", "0 | \n", "C_1 | \n", "
1 | \n", "1 | \n", "C_2 | \n", "
2 | \n", "0 | \n", "C_3 | \n", "
3 | \n", "1 | \n", "C_4 | \n", "
4 | \n", "0 | \n", "C_5 | \n", "
... | \n", "... | \n", "... | \n", "
9590 | \n", "1 | \n", "C_9591 | \n", "
9591 | \n", "1 | \n", "C_9592 | \n", "
9592 | \n", "1 | \n", "C_9593 | \n", "
9593 | \n", "1 | \n", "C_9594 | \n", "
9594 | \n", "1 | \n", "C_9595 | \n", "
9595 rows × 2 columns
\n", "