ribesstefano
committed on
Commit
•
cfa31cd
1
Parent(s):
df1b305
Renamed data files and notebooks accordingly
Browse files
- .gitignore +5 -0
- data/PROTAC-DB-Scraped.csv +0 -0
- data/PROTAC-DB.csv +0 -0
- data/PROTAC-Pedia.csv +0 -0
- data/poi_uniprot2sequence.pkl +0 -3
- notebooks/data_curation.ipynb +5 -5
- notebooks/protac_degradation_predictor.ipynb +19 -3
.gitignore
CHANGED
@@ -158,3 +158,8 @@ cython_debug/
|
|
158 |
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
159 |
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
160 |
#.idea/
|
|
|
|
|
|
|
|
|
|
|
|
158 |
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
159 |
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
160 |
#.idea/
|
161 |
+
|
162 |
+
|
163 |
+
# Custom files
|
164 |
+
|
165 |
+
data/uniprot2embedding.h5
|
data/PROTAC-DB-Scraped.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/PROTAC-DB.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/PROTAC-Pedia.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/poi_uniprot2sequence.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:85cfb7aadaa54b48490faed7f8d791caa38a354c26501cc213508aeb526ed189
|
3 |
-
size 220472
|
|
|
|
|
|
|
|
notebooks/data_curation.ipynb
CHANGED
@@ -116,7 +116,7 @@
|
|
116 |
},
|
117 |
{
|
118 |
"cell_type": "code",
|
119 |
-
"execution_count":
|
120 |
"metadata": {},
|
121 |
"outputs": [
|
122 |
{
|
@@ -128,7 +128,7 @@
|
|
128 |
}
|
129 |
],
|
130 |
"source": [
|
131 |
-
"protacdb_file = os.path.join(data_dir, '
|
132 |
"protacdb_url = 'http://cadd.zju.edu.cn/protacdb/statics/binaryDownload/csv/protac/protac.csv'\n",
|
133 |
"if os.path.exists(protacdb_file):\n",
|
134 |
" protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)\n",
|
@@ -147,7 +147,7 @@
|
|
147 |
},
|
148 |
{
|
149 |
"cell_type": "code",
|
150 |
-
"execution_count":
|
151 |
"metadata": {},
|
152 |
"outputs": [
|
153 |
{
|
@@ -277,7 +277,7 @@
|
|
277 |
],
|
278 |
"source": [
|
279 |
"scraped_protac_df = pd.read_csv(os.path.join(\n",
|
280 |
-
" data_dir, '
|
281 |
"# Rename columns\n",
|
282 |
"old2new = {\n",
|
283 |
" \"Assay (Percent degradation)\": \"Percent degradation (%)\",\n",
|
@@ -938,7 +938,7 @@
|
|
938 |
}
|
939 |
],
|
940 |
"source": [
|
941 |
-
"df_file = os.path.join(data_dir, '
|
942 |
"protac_pedia_df = pd.read_csv(df_file)\n",
|
943 |
"print(f'protac_pedia_df len: {len(protac_pedia_df)}')\n",
|
944 |
"protac_pedia_df.head()"
|
|
|
116 |
},
|
117 |
{
|
118 |
"cell_type": "code",
|
119 |
+
"execution_count": null,
|
120 |
"metadata": {},
|
121 |
"outputs": [
|
122 |
{
|
|
|
128 |
}
|
129 |
],
|
130 |
"source": [
|
131 |
+
"protacdb_file = os.path.join(data_dir, 'PROTAC-DB.csv')\n",
|
132 |
"protacdb_url = 'http://cadd.zju.edu.cn/protacdb/statics/binaryDownload/csv/protac/protac.csv'\n",
|
133 |
"if os.path.exists(protacdb_file):\n",
|
134 |
" protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)\n",
|
|
|
147 |
},
|
148 |
{
|
149 |
"cell_type": "code",
|
150 |
+
"execution_count": null,
|
151 |
"metadata": {},
|
152 |
"outputs": [
|
153 |
{
|
|
|
277 |
],
|
278 |
"source": [
|
279 |
"scraped_protac_df = pd.read_csv(os.path.join(\n",
|
280 |
+
" data_dir, 'PROTAC-DB-Scraped.csv'))\n",
|
281 |
"# Rename columns\n",
|
282 |
"old2new = {\n",
|
283 |
" \"Assay (Percent degradation)\": \"Percent degradation (%)\",\n",
|
|
|
938 |
}
|
939 |
],
|
940 |
"source": [
|
941 |
+
"df_file = os.path.join(data_dir, 'PROTAC-Pedia.csv')\n",
|
942 |
"protac_pedia_df = pd.read_csv(df_file)\n",
|
943 |
"print(f'protac_pedia_df len: {len(protac_pedia_df)}')\n",
|
944 |
"protac_pedia_df.head()"
|
notebooks/protac_degradation_predictor.ipynb
CHANGED
@@ -243,7 +243,7 @@
|
|
243 |
"source": [
|
244 |
"import pandas as pd\n",
|
245 |
"\n",
|
246 |
-
"protac_df = pd.read_csv('../data/
|
247 |
"protac_df.head()"
|
248 |
]
|
249 |
},
|
@@ -341,7 +341,23 @@
|
|
341 |
"cell_type": "markdown",
|
342 |
"metadata": {},
|
343 |
"source": [
|
344 |
-
"Protein embeddings downloaded from [Uniprot](https://www.uniprot.org/help/embeddings)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
345 |
]
|
346 |
},
|
347 |
{
|
@@ -394,7 +410,7 @@
|
|
394 |
"from tqdm.auto import tqdm\n",
|
395 |
"\n",
|
396 |
"protein_embeddings = {}\n",
|
397 |
-
"with h5py.File(\"../data/
|
398 |
" print(f\"number of entries: {len(file.items()):,}\")\n",
|
399 |
" uniprots = protac_df['Uniprot'].unique().tolist()\n",
|
400 |
" uniprots += protac_df['E3 Ligase Uniprot'].unique().tolist()\n",
|
|
|
243 |
"source": [
|
244 |
"import pandas as pd\n",
|
245 |
"\n",
|
246 |
+
"protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')\n",
|
247 |
"protac_df.head()"
|
248 |
]
|
249 |
},
|
|
|
341 |
"cell_type": "markdown",
|
342 |
"metadata": {},
|
343 |
"source": [
|
344 |
+
"Protein embeddings downloaded from [Uniprot](https://www.uniprot.org/help/embeddings).\n",
|
345 |
+
"\n",
|
346 |
+
"Please note that running the following cell the first time might take a while."
|
347 |
+
]
|
348 |
+
},
|
349 |
+
{
|
350 |
+
"cell_type": "code",
|
351 |
+
"execution_count": null,
|
352 |
+
"metadata": {},
|
353 |
+
"outputs": [],
|
354 |
+
"source": [
|
355 |
+
"import os\n",
|
356 |
+
"\n",
|
357 |
+
"download_link = \"https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/UP000005640_9606/per-protein.h5\"\n",
|
358 |
+
"embeddings_path = \"../data/uniprot2embedding.h5\"\n",
|
359 |
+
"if not os.path.exists(embeddings_path):\n",
|
360 |
+
" !wget {download_link} {embeddings_path}"
|
361 |
]
|
362 |
},
|
363 |
{
|
|
|
410 |
"from tqdm.auto import tqdm\n",
|
411 |
"\n",
|
412 |
"protein_embeddings = {}\n",
|
413 |
+
"with h5py.File(\"../data/per-protein-embeddings.h5\", \"r\") as file:\n",
|
414 |
" print(f\"number of entries: {len(file.items()):,}\")\n",
|
415 |
" uniprots = protac_df['Uniprot'].unique().tolist()\n",
|
416 |
" uniprots += protac_df['E3 Ligase Uniprot'].unique().tolist()\n",
|