ribesstefano committed on
Commit
cfa31cd
1 Parent(s): df1b305

Renamed data files and notebooks accordingly

Browse files
.gitignore CHANGED
@@ -158,3 +158,8 @@ cython_debug/
158
  # and can be added to the global gitignore or merged into this file. For a more nuclear
159
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
  #.idea/
 
 
 
 
 
 
158
  # and can be added to the global gitignore or merged into this file. For a more nuclear
159
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
  #.idea/
161
+
162
+
163
+ # Custom files
164
+
165
+ data/uniprot2embedding.h5
data/PROTAC-DB-Scraped.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/PROTAC-DB.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/PROTAC-Pedia.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/poi_uniprot2sequence.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:85cfb7aadaa54b48490faed7f8d791caa38a354c26501cc213508aeb526ed189
3
- size 220472
 
 
 
 
notebooks/data_curation.ipynb CHANGED
@@ -116,7 +116,7 @@
116
  },
117
  {
118
  "cell_type": "code",
119
- "execution_count": 71,
120
  "metadata": {},
121
  "outputs": [
122
  {
@@ -128,7 +128,7 @@
128
  }
129
  ],
130
  "source": [
131
- "protacdb_file = os.path.join(data_dir, 'raw', 'protac.csv')\n",
132
  "protacdb_url = 'http://cadd.zju.edu.cn/protacdb/statics/binaryDownload/csv/protac/protac.csv'\n",
133
  "if os.path.exists(protacdb_file):\n",
134
  " protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)\n",
@@ -147,7 +147,7 @@
147
  },
148
  {
149
  "cell_type": "code",
150
- "execution_count": 6,
151
  "metadata": {},
152
  "outputs": [
153
  {
@@ -277,7 +277,7 @@
277
  ],
278
  "source": [
279
  "scraped_protac_df = pd.read_csv(os.path.join(\n",
280
- " data_dir, 'processed', 'protac_scraped.csv'))\n",
281
  "# Rename columns\n",
282
  "old2new = {\n",
283
  " \"Assay (Percent degradation)\": \"Percent degradation (%)\",\n",
@@ -938,7 +938,7 @@
938
  }
939
  ],
940
  "source": [
941
- "df_file = os.path.join(data_dir, 'raw', 'protac_pedia_20220210.csv')\n",
942
  "protac_pedia_df = pd.read_csv(df_file)\n",
943
  "print(f'protac_pedia_df len: {len(protac_pedia_df)}')\n",
944
  "protac_pedia_df.head()"
 
116
  },
117
  {
118
  "cell_type": "code",
119
+ "execution_count": null,
120
  "metadata": {},
121
  "outputs": [
122
  {
 
128
  }
129
  ],
130
  "source": [
131
+ "protacdb_file = os.path.join(data_dir, 'PROTAC-DB.csv')\n",
132
  "protacdb_url = 'http://cadd.zju.edu.cn/protacdb/statics/binaryDownload/csv/protac/protac.csv'\n",
133
  "if os.path.exists(protacdb_file):\n",
134
  " protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)\n",
 
147
  },
148
  {
149
  "cell_type": "code",
150
+ "execution_count": null,
151
  "metadata": {},
152
  "outputs": [
153
  {
 
277
  ],
278
  "source": [
279
  "scraped_protac_df = pd.read_csv(os.path.join(\n",
280
+ " data_dir, 'PROTAC-DB-Scraped.csv'))\n",
281
  "# Rename columns\n",
282
  "old2new = {\n",
283
  " \"Assay (Percent degradation)\": \"Percent degradation (%)\",\n",
 
938
  }
939
  ],
940
  "source": [
941
+ "df_file = os.path.join(data_dir, 'PROTAC-Pedia.csv')\n",
942
  "protac_pedia_df = pd.read_csv(df_file)\n",
943
  "print(f'protac_pedia_df len: {len(protac_pedia_df)}')\n",
944
  "protac_pedia_df.head()"
notebooks/protac_degradation_predictor.ipynb CHANGED
@@ -243,7 +243,7 @@
243
  "source": [
244
  "import pandas as pd\n",
245
  "\n",
246
- "protac_df = pd.read_csv('../data/processed/PROTAC-Degradation-DB.csv')\n",
247
  "protac_df.head()"
248
  ]
249
  },
@@ -341,7 +341,23 @@
341
  "cell_type": "markdown",
342
  "metadata": {},
343
  "source": [
344
- "Protein embeddings downloaded from [Uniprot](https://www.uniprot.org/help/embeddings)."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  ]
346
  },
347
  {
@@ -394,7 +410,7 @@
394
  "from tqdm.auto import tqdm\n",
395
  "\n",
396
  "protein_embeddings = {}\n",
397
- "with h5py.File(\"../data/raw/per-protein-embeddings.h5\", \"r\") as file:\n",
398
  " print(f\"number of entries: {len(file.items()):,}\")\n",
399
  " uniprots = protac_df['Uniprot'].unique().tolist()\n",
400
  " uniprots += protac_df['E3 Ligase Uniprot'].unique().tolist()\n",
 
243
  "source": [
244
  "import pandas as pd\n",
245
  "\n",
246
+ "protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')\n",
247
  "protac_df.head()"
248
  ]
249
  },
 
341
  "cell_type": "markdown",
342
  "metadata": {},
343
  "source": [
344
+ "Protein embeddings downloaded from [Uniprot](https://www.uniprot.org/help/embeddings).\n",
345
+ "\n",
346
+ "Please note that running the following cell the first time might take a while."
347
+ ]
348
+ },
349
+ {
350
+ "cell_type": "code",
351
+ "execution_count": null,
352
+ "metadata": {},
353
+ "outputs": [],
354
+ "source": [
355
+ "import os\n",
356
+ "\n",
357
+ "download_link = \"https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/UP000005640_9606/per-protein.h5\"\n",
358
+ "embeddings_path = \"../data/uniprot2embedding.h5\"\n",
359
+ "if not os.path.exists(embeddings_path):  # download once; wget needs -O to write to the target path\n",
360
+ " !wget {download_link} -O {embeddings_path}"
361
  ]
362
  },
363
  {
 
410
  "from tqdm.auto import tqdm\n",
411
  "\n",
412
  "protein_embeddings = {}\n",
413
+ "with h5py.File(\"../data/uniprot2embedding.h5\", \"r\") as file:  # same file the download cell writes\n",
414
  " print(f\"number of entries: {len(file.items()):,}\")\n",
415
  " uniprots = protac_df['Uniprot'].unique().tolist()\n",
416
  " uniprots += protac_df['E3 Ligase Uniprot'].unique().tolist()\n",