Spaces:

ailab-bio
/

PROTAC-Degradation-Predictor

Running

ribesstefano committed on Feb 15, 2024

Commit

cfa31cd

•

1 Parent(s): df1b305

Renamed data files and notebooks accordingly

Files changed (7) hide show

.gitignore CHANGED Viewed

@@ -158,3 +158,8 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/

 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+# Custom files
+data/uniprot2embedding.h5

data/PROTAC-DB-Scraped.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/PROTAC-DB.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/PROTAC-Pedia.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/poi_uniprot2sequence.pkl DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:85cfb7aadaa54b48490faed7f8d791caa38a354c26501cc213508aeb526ed189
-size 220472

notebooks/data_curation.ipynb CHANGED Viewed

@@ -116,7 +116,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 71,
    "metadata": {},
    "outputs": [
     {
@@ -128,7 +128,7 @@
     }
    ],
    "source": [
-    "protacdb_file = os.path.join(data_dir, 'raw', 'protac.csv')\n",
     "protacdb_url = 'http://cadd.zju.edu.cn/protacdb/statics/binaryDownload/csv/protac/protac.csv'\n",
     "if os.path.exists(protacdb_file):\n",
     "    protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)\n",
@@ -147,7 +147,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -277,7 +277,7 @@
    ],
    "source": [
     "scraped_protac_df = pd.read_csv(os.path.join(\n",
-    "    data_dir, 'processed', 'protac_scraped.csv'))\n",
     "# Rename columns\n",
     "old2new = {\n",
     "    \"Assay (Percent degradation)\": \"Percent degradation (%)\",\n",
@@ -938,7 +938,7 @@
     }
    ],
    "source": [
-    "df_file = os.path.join(data_dir, 'raw', 'protac_pedia_20220210.csv')\n",
     "protac_pedia_df = pd.read_csv(df_file)\n",
     "print(f'protac_pedia_df len: {len(protac_pedia_df)}')\n",
     "protac_pedia_df.head()"

   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
     }
    ],
    "source": [
+    "protacdb_file = os.path.join(data_dir, 'PROTAC-DB.csv')\n",
     "protacdb_url = 'http://cadd.zju.edu.cn/protacdb/statics/binaryDownload/csv/protac/protac.csv'\n",
     "if os.path.exists(protacdb_file):\n",
     "    protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
    ],
    "source": [
     "scraped_protac_df = pd.read_csv(os.path.join(\n",
+    "    data_dir, 'PROTAC-DB-Scraped.csv'))\n",
     "# Rename columns\n",
     "old2new = {\n",
     "    \"Assay (Percent degradation)\": \"Percent degradation (%)\",\n",
     }
    ],
    "source": [
+    "df_file = os.path.join(data_dir, 'PROTAC-Pedia.csv')\n",
     "protac_pedia_df = pd.read_csv(df_file)\n",
     "print(f'protac_pedia_df len: {len(protac_pedia_df)}')\n",
     "protac_pedia_df.head()"

notebooks/protac_degradation_predictor.ipynb CHANGED Viewed

@@ -243,7 +243,7 @@
    "source": [
     "import pandas as pd\n",
     "\n",
-    "protac_df = pd.read_csv('../data/processed/PROTAC-Degradation-DB.csv')\n",
     "protac_df.head()"
    ]
   },
@@ -341,7 +341,23 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Protein embeddings downloaded from [Uniprot](https://www.uniprot.org/help/embeddings)."
    ]
   },
   {
@@ -394,7 +410,7 @@
     "from tqdm.auto import tqdm\n",
     "\n",
     "protein_embeddings = {}\n",
-    "with h5py.File(\"../data/raw/per-protein-embeddings.h5\", \"r\") as file:\n",
     "    print(f\"number of entries: {len(file.items()):,}\")\n",
     "    uniprots = protac_df['Uniprot'].unique().tolist()\n",
     "    uniprots += protac_df['E3 Ligase Uniprot'].unique().tolist()\n",

    "source": [
     "import pandas as pd\n",
     "\n",
+    "protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')\n",
     "protac_df.head()"
    ]
   },
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "Protein embeddings downloaded from [Uniprot](https://www.uniprot.org/help/embeddings).\n",
+    "\n",
+    "Please note that running the following cell the first time might take a while."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Source URL of the per-protein embeddings and the local path they are cached at.\n",
+    "download_link = \"https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/UP000005640_9606/per-protein.h5\"\n",
+    "embeddings_path = \"../data/uniprot2embedding.h5\"\n",
+    "# Download only when no local copy exists, so re-running the notebook is cheap.\n",
+    "# NOTE: if a download is interrupted, delete the partial file before re-running,\n",
+    "# otherwise this existence check will skip the download.\n",
+    "if not os.path.exists(embeddings_path):\n",
+    "    # -O sets the output file; without it wget treats the second argument as another URL.\n",
+    "    !wget {download_link} -O {embeddings_path}"
    ]
   },
   {
     "from tqdm.auto import tqdm\n",
     "\n",
     "protein_embeddings = {}\n",
+    "# Must be the same file the download cell stores in `embeddings_path`.\n",
+    "with h5py.File(\"../data/uniprot2embedding.h5\", \"r\") as file:\n",
     "    print(f\"number of entries: {len(file.items()):,}\")\n",
     "    uniprots = protac_df['Uniprot'].unique().tolist()\n",
     "    uniprots += protac_df['E3 Ligase Uniprot'].unique().tolist()\n",