Spaces:
Running
on
T4
Running
on
T4
Simon Duerr
commited on
Commit
•
3279dbd
1
Parent(s):
8b47127
cpu requirements
Browse files- app.py +189 -86
- requirements.txt +3 -3
app.py
CHANGED
@@ -69,10 +69,10 @@ def clear_mem():
|
|
69 |
buf.delete()
|
70 |
|
71 |
|
72 |
-
print("Is cuda available",torch.cuda.is_available())
|
73 |
-
stream = os.popen(
|
74 |
-
output = stream.read()
|
75 |
-
print(output)
|
76 |
|
77 |
|
78 |
def setup_af(seq, model_name="model_5_ptm"):
|
@@ -177,7 +177,7 @@ def align_structures(pdb1, pdb2, lenRes, index):
|
|
177 |
io.set_structure(sample_structure)
|
178 |
io.save(f"outputs/out_{index}_aligned.pdb")
|
179 |
# Doing this to get around biopython CEALIGN bug
|
180 |
-
#subprocess.call("pymol -c -Q -r cealign.pml", shell=True)
|
181 |
|
182 |
return aligner.rms, "outputs/reference.pdb", f"outputs/out_{index}_aligned.pdb"
|
183 |
|
@@ -219,7 +219,7 @@ def run_alphafold(sequences, num_recycles):
|
|
219 |
}
|
220 |
|
221 |
positions = []
|
222 |
-
|
223 |
for r in range(recycles + 1):
|
224 |
outs = RUNNER(x, OPT)
|
225 |
outs = jax.tree_map(lambda x: np.asarray(x), outs)
|
@@ -228,10 +228,12 @@ def run_alphafold(sequences, num_recycles):
|
|
228 |
plddts.append(outs["plddt"][:LEN])
|
229 |
paes.append(outs["pae"])
|
230 |
if os.path.exists("/home/duerr/phd/08_Code/ProteinMPNN"):
|
231 |
-
save_pdb(
|
|
|
|
|
232 |
else:
|
233 |
save_pdb(outs, f"/home/user/app/outputs/out_{i}.pdb", LEN)
|
234 |
-
return plddts,paes, LEN
|
235 |
|
236 |
|
237 |
if os.path.exists("/home/duerr/phd/08_Code/ProteinMPNN"):
|
@@ -308,7 +310,7 @@ def preprocess_mol(pdb_code="", filepath=""):
|
|
308 |
return None
|
309 |
else:
|
310 |
mol = Molecule(pdb_code)
|
311 |
-
mol.write(
|
312 |
# clean messy files and only include protein itself
|
313 |
mol.filter("protein")
|
314 |
# renumber using moleculekit 0...len(protein)
|
@@ -324,70 +326,73 @@ def preprocess_mol(pdb_code="", filepath=""):
|
|
324 |
mol.write("cleaned.pdb")
|
325 |
return "cleaned.pdb", df
|
326 |
|
|
|
327 |
def assign_sasa(mol):
|
328 |
from moleculekit.projections.metricsasa import MetricSasa
|
329 |
-
|
330 |
-
|
331 |
-
)
|
332 |
sasaR = metr.project(mol)[0]
|
333 |
is_prot = mol.atomselect("protein")
|
334 |
-
resids=pd.DataFrame.from_dict({"resid":mol.resid, "is_prot":is_prot})
|
335 |
-
new_masses=[]
|
336 |
i_without_non_prot = 0
|
337 |
-
for i, g in resids.groupby((resids[
|
338 |
-
if g["is_prot"].unique()[0]==True:
|
339 |
-
g["sasa"]=sasaR[i_without_non_prot]
|
340 |
-
i_without_non_prot+=1
|
341 |
else:
|
342 |
-
g["sasa"]=0
|
343 |
new_masses.extend(list(g.sasa))
|
344 |
return np.array(new_masses)
|
345 |
|
|
|
346 |
def process_atomsel(atomsel):
|
347 |
"""everything lowercase and replace some keywords not relevant for protein design"""
|
348 |
-
atomsel=re.sub(
|
349 |
-
atomsel=re.sub(
|
350 |
return atomsel
|
351 |
|
352 |
|
353 |
def make_fixed_positions_dict(atomsel, residue_index_df):
|
354 |
# we use the uploaded file for the selection
|
355 |
-
mol = Molecule(
|
356 |
# use index for selection as resids will change
|
357 |
|
358 |
-
|
359 |
# set sasa to 0 for all non protein atoms (all non protein atoms are deleted later)
|
360 |
mol.masses = assign_sasa(mol)
|
361 |
print(mol.masses.shape)
|
362 |
print(assign_sasa(mol).shape)
|
363 |
atomsel = process_atomsel(atomsel)
|
364 |
-
selected_residues = mol.get("index",atomsel)
|
365 |
|
366 |
# clean up
|
367 |
mol.filter("protein")
|
368 |
mol.renumberResidues()
|
369 |
# based on selected index now get resids
|
370 |
selected_residues = [str(i) for i in selected_residues]
|
371 |
-
if len(selected_residues)==0:
|
372 |
return None, []
|
373 |
selected_residues_str = " ".join(selected_residues)
|
374 |
-
selected_residues=set(mol.get(
|
375 |
|
376 |
# use the proteinMPNN index nomenclature to assemble fixed_positions_dict
|
377 |
-
fixed_positions_df = residue_index_df[
|
|
|
|
|
378 |
|
379 |
-
chains = set(mol.get(
|
380 |
-
fixed_position_dict = {
|
381 |
-
#store the selected residues in a list for the visualization later with cleaned.pdb
|
382 |
-
selected_residues = list(fixed_positions_df[
|
383 |
|
384 |
for c in chains:
|
385 |
-
fixed_position_dict[
|
386 |
|
387 |
for i, row in fixed_positions_df.iterrows():
|
388 |
-
fixed_position_dict[
|
389 |
return fixed_position_dict, selected_residues
|
390 |
|
|
|
391 |
def update(
|
392 |
inp,
|
393 |
file,
|
@@ -398,7 +403,7 @@ def update(
|
|
398 |
sampling_temp,
|
399 |
model_name,
|
400 |
backbone_noise,
|
401 |
-
atomsel
|
402 |
):
|
403 |
from protein_mpnn_utils import (
|
404 |
loss_nll,
|
@@ -471,8 +476,10 @@ def update(
|
|
471 |
if atomsel == "":
|
472 |
fixed_positions_dict, selected_residues = None, []
|
473 |
else:
|
474 |
-
fixed_positions_dict, selected_residues=make_fixed_positions_dict(
|
475 |
-
|
|
|
|
|
476 |
pssm_dict = None
|
477 |
omit_AA_dict = None
|
478 |
bias_AA_dict = None
|
@@ -489,7 +496,7 @@ def update(
|
|
489 |
tied_positions_dict = make_tied_positions_for_homomers(pdb_dict_list)
|
490 |
else:
|
491 |
tied_positions_dict = None
|
492 |
-
|
493 |
chain_id_dict = {}
|
494 |
chain_id_dict[pdb_dict_list[0]["name"]] = (designed_chain_list, fixed_chain_list)
|
495 |
with torch.no_grad():
|
@@ -738,7 +745,7 @@ def update(
|
|
738 |
)
|
739 |
seq_list.append(seq + chain_s)
|
740 |
message += f"{line}\n"
|
741 |
-
if fixed_positions_dict!=None:
|
742 |
message += f"\nfixed positions:* {fixed_positions_dict['cleaned']} \n\n*uses CHAIN:[1..len(chain)] residue numbering"
|
743 |
# somehow sequences still contain X, remove again
|
744 |
for i, x in enumerate(seq_list):
|
@@ -769,7 +776,7 @@ def update(
|
|
769 |
)
|
770 |
|
771 |
fig_tadjusted.update_xaxes(side="top")
|
772 |
-
seq_dict = {"seq_list":seq_list, "recovery":seq_recovery, "seq_score":seq_score}
|
773 |
return (
|
774 |
message,
|
775 |
fig,
|
@@ -779,17 +786,17 @@ def update(
|
|
779 |
pdb_path,
|
780 |
gr.Dropdown.update(choices=seq_list),
|
781 |
selected_residues,
|
782 |
-
seq_dict
|
783 |
)
|
784 |
|
785 |
|
786 |
-
def update_AF(seq_dict, pdb, num_recycles,selectedResidues):
|
787 |
|
788 |
# # run alphafold using ray
|
789 |
# plddts, pae, num_res = run_alphafold(
|
790 |
# startsequence, num_recycles
|
791 |
# )
|
792 |
-
allSeqs = seq_dict[
|
793 |
lenSeqs = len(allSeqs)
|
794 |
if len(allSeqs[0]) > 700:
|
795 |
return (
|
@@ -801,14 +808,22 @@ def update_AF(seq_dict, pdb, num_recycles,selectedResidues):
|
|
801 |
plt.figure(),
|
802 |
plt.figure(),
|
803 |
)
|
804 |
-
|
805 |
plddts, paes, num_res = ray.get(run_alphafold.remote(allSeqs, num_recycles))
|
806 |
|
807 |
sequences = {}
|
808 |
for i in range(lenSeqs):
|
809 |
-
rms, input_pdb, aligned_pdb = align_structures(
|
810 |
-
|
811 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
812 |
print(results)
|
813 |
plots = []
|
814 |
for index, plddts_val in enumerate(plddts):
|
@@ -837,7 +852,7 @@ def update_AF(seq_dict, pdb, num_recycles,selectedResidues):
|
|
837 |
legend=dict(yanchor="bottom", y=0.01, xanchor="left", x=0.99),
|
838 |
)
|
839 |
pae_plots = []
|
840 |
-
for i,pae in enumerate(paes):
|
841 |
plt.figure()
|
842 |
plt.title(f"Predicted Aligned Error sequence {i}")
|
843 |
Ln = pae.shape[0]
|
@@ -858,7 +873,20 @@ def update_AF(seq_dict, pdb, num_recycles,selectedResidues):
|
|
858 |
# plotAF_pae.write_html("test.html")
|
859 |
# plotAF_pae.update_layout(title="Predicted Aligned Error", template="simple_white")
|
860 |
|
861 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
862 |
|
863 |
|
864 |
def read_mol(molpath):
|
@@ -870,26 +898,39 @@ def read_mol(molpath):
|
|
870 |
return mol
|
871 |
|
872 |
|
873 |
-
def molecule(
|
|
|
|
|
874 |
|
875 |
mol = read_mol("outputs/reference.pdb")
|
876 |
-
options =""
|
877 |
pred_mol = "["
|
878 |
seqdata = "{"
|
879 |
selected = "selected"
|
880 |
for i in range(lenSeqs):
|
881 |
-
seqdata+=
|
882 |
-
|
883 |
-
|
884 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
885 |
selected = ""
|
886 |
-
if i!=lenSeqs-1:
|
887 |
-
pred_mol+=","
|
888 |
-
seqdata+=","
|
889 |
-
pred_mol+="]"
|
890 |
-
seqdata+="}"
|
891 |
-
|
892 |
-
|
893 |
|
894 |
x = (
|
895 |
"""<!DOCTYPE html>
|
@@ -934,7 +975,9 @@ def molecule(input_pdb, aligned_pdb, lenSeqs, num_res, selectedResidues, allSeqs
|
|
934 |
a sequence</label>
|
935 |
<select id="seq"
|
936 |
class="bg-gray-50 border border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2.5 dark:bg-gray-700 dark:border-gray-600 dark:placeholder-gray-400 dark:text-white dark:focus:ring-blue-500 dark:focus:border-blue-500">
|
937 |
-
"""
|
|
|
|
|
938 |
</select>
|
939 |
</div>
|
940 |
<div class="font-mono bg-gray-100 py-3 px-2 font-sm rounded">
|
@@ -1033,16 +1076,18 @@ def molecule(input_pdb, aligned_pdb, lenSeqs, num_res, selectedResidues, allSeqs
|
|
1033 |
let element = null;
|
1034 |
let config = null;
|
1035 |
let currentIndex = 0;
|
1036 |
-
let seqs = """
|
|
|
|
|
1037 |
let data = """
|
1038 |
+ pred_mol
|
1039 |
+ """
|
1040 |
let pdb = `"""
|
1041 |
+ mol
|
1042 |
+ """`
|
1043 |
-
var selectedResidues = """
|
1044 |
-
|
1045 |
-
|
1046 |
//AlphaFold code from https://gist.github.com/piroyon/30d1c1099ad488a7952c3b21a5bebc96
|
1047 |
let colorAlpha = function (atom) {
|
1048 |
if (atom.b < 50) {
|
@@ -1057,7 +1102,9 @@ def molecule(input_pdb, aligned_pdb, lenSeqs, num_res, selectedResidues, allSeqs
|
|
1057 |
};
|
1058 |
|
1059 |
let colors = {}
|
1060 |
-
for (let i=0; i<"""
|
|
|
|
|
1061 |
if (selectedResidues.includes(i)){
|
1062 |
colors[i]="hotpink"
|
1063 |
}else{
|
@@ -1152,7 +1199,16 @@ def molecule(input_pdb, aligned_pdb, lenSeqs, num_res, selectedResidues, allSeqs
|
|
1152 |
|
1153 |
|
1154 |
def set_examples(example):
|
1155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1156 |
return [
|
1157 |
label,
|
1158 |
inp,
|
@@ -1161,7 +1217,7 @@ def set_examples(example):
|
|
1161 |
homomer,
|
1162 |
gr.Slider.update(value=num_seqs),
|
1163 |
gr.Radio.update(value=sampling_temp),
|
1164 |
-
atomsel
|
1165 |
]
|
1166 |
|
1167 |
|
@@ -1198,7 +1254,8 @@ with proteinMPNN:
|
|
1198 |
value=0.1,
|
1199 |
label="Sampling temperature",
|
1200 |
)
|
1201 |
-
gr.Markdown(
|
|
|
1202 |
"""
|
1203 |
)
|
1204 |
with gr.Row():
|
@@ -1220,8 +1277,9 @@ with proteinMPNN:
|
|
1220 |
gr.Markdown(
|
1221 |
"for correct symmetric tying lenghts of homomer chains should be the same"
|
1222 |
)
|
1223 |
-
gr.Markdown(
|
1224 |
-
gr.Markdown(
|
|
|
1225 |
The [VMD selection](http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html) synthax is used. You can also select based on ligands or chains in the input structure to specify interfaces to be fixed.
|
1226 |
|
1227 |
- <code>within 5 of resid 94</code> All residues that have >1 atom closer than 5 Å to any atom of residue 94
|
@@ -1235,8 +1293,11 @@ with proteinMPNN:
|
|
1235 |
- <code>pLDDT >70 </code> Redesign all residues with low pLDDT
|
1236 |
|
1237 |
Note that <code>sasa</code> and <code>pLDDT</code> selectors modify default VMD behavior. SASA is calculated using moleculekit and written to the mass attribute. Selections based on mass do not work.
|
1238 |
-
pLDDT is an alias for beta, it only works correctly with structures that contain the appropriate values in the beta column of the PDB file. """
|
1239 |
-
|
|
|
|
|
|
|
1240 |
|
1241 |
btn = gr.Button("Run")
|
1242 |
label = gr.Textbox(label="Label", visible=False)
|
@@ -1249,19 +1310,45 @@ with proteinMPNN:
|
|
1249 |
homomer,
|
1250 |
num_seqs,
|
1251 |
sampling_temp,
|
1252 |
-
atomsel
|
1253 |
],
|
1254 |
samples=[
|
1255 |
-
["Homomer design", "1O91", "A,B,C", "", True, 2, 0.1,""],
|
1256 |
-
["Monomer design", "6MRR", "A", "", False, 2, 0.1,""],
|
1257 |
["Redesign of Homomer to Heteromer", "3HTN", "A,B", "C", False, 2, 0.1, ""],
|
1258 |
-
[
|
1259 |
-
|
1260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1261 |
],
|
1262 |
)
|
1263 |
|
1264 |
-
|
1265 |
gr.Markdown("# Output")
|
1266 |
|
1267 |
with gr.Tabs():
|
@@ -1296,8 +1383,9 @@ with proteinMPNN:
|
|
1296 |
with gr.Row():
|
1297 |
with gr.Row():
|
1298 |
chosen_seq = gr.Dropdown(
|
1299 |
-
choices=[],
|
1300 |
-
|
|
|
1301 |
)
|
1302 |
num_recycles = gr.Dropdown(
|
1303 |
choices=[0, 1, 3, 5], value=3, label="num Recycles"
|
@@ -1307,10 +1395,25 @@ with proteinMPNN:
|
|
1307 |
mol = gr.HTML()
|
1308 |
with gr.Column():
|
1309 |
gr.Markdown("## Metrics")
|
1310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1311 |
plotAF_plddt = gr.Plot(label="pLDDT")
|
1312 |
# remove maxh80 class from css
|
1313 |
-
plotAF_pae = gr.Gallery(label="PAE plots")
|
1314 |
tempFile = gr.Variable()
|
1315 |
selectedResidues = gr.Variable()
|
1316 |
seq_dict = gr.Variable()
|
@@ -1337,7 +1440,7 @@ with proteinMPNN:
|
|
1337 |
tempFile,
|
1338 |
chosen_seq,
|
1339 |
selectedResidues,
|
1340 |
-
seq_dict
|
1341 |
],
|
1342 |
)
|
1343 |
btnAF.click(
|
|
|
69 |
buf.delete()
|
70 |
|
71 |
|
72 |
+
print("Is cuda available", torch.cuda.is_available())
|
73 |
+
# stream = os.popen("nvcc --version")
|
74 |
+
# output = stream.read()
|
75 |
+
# print(output)
|
76 |
|
77 |
|
78 |
def setup_af(seq, model_name="model_5_ptm"):
|
|
|
177 |
io.set_structure(sample_structure)
|
178 |
io.save(f"outputs/out_{index}_aligned.pdb")
|
179 |
# Doing this to get around biopython CEALIGN bug
|
180 |
+
# subprocess.call("pymol -c -Q -r cealign.pml", shell=True)
|
181 |
|
182 |
return aligner.rms, "outputs/reference.pdb", f"outputs/out_{index}_aligned.pdb"
|
183 |
|
|
|
219 |
}
|
220 |
|
221 |
positions = []
|
222 |
+
|
223 |
for r in range(recycles + 1):
|
224 |
outs = RUNNER(x, OPT)
|
225 |
outs = jax.tree_map(lambda x: np.asarray(x), outs)
|
|
|
228 |
plddts.append(outs["plddt"][:LEN])
|
229 |
paes.append(outs["pae"])
|
230 |
if os.path.exists("/home/duerr/phd/08_Code/ProteinMPNN"):
|
231 |
+
save_pdb(
|
232 |
+
outs, f"/home/duerr/phd/08_Code/ProteinMPNN/outputs/out_{i}.pdb", LEN
|
233 |
+
)
|
234 |
else:
|
235 |
save_pdb(outs, f"/home/user/app/outputs/out_{i}.pdb", LEN)
|
236 |
+
return plddts, paes, LEN
|
237 |
|
238 |
|
239 |
if os.path.exists("/home/duerr/phd/08_Code/ProteinMPNN"):
|
|
|
310 |
return None
|
311 |
else:
|
312 |
mol = Molecule(pdb_code)
|
313 |
+
mol.write("original.pdb")
|
314 |
# clean messy files and only include protein itself
|
315 |
mol.filter("protein")
|
316 |
# renumber using moleculekit 0...len(protein)
|
|
|
326 |
mol.write("cleaned.pdb")
|
327 |
return "cleaned.pdb", df
|
328 |
|
329 |
+
|
330 |
def assign_sasa(mol):
|
331 |
from moleculekit.projections.metricsasa import MetricSasa
|
332 |
+
|
333 |
+
metr = MetricSasa(mode="residue", filtersel="protein")
|
|
|
334 |
sasaR = metr.project(mol)[0]
|
335 |
is_prot = mol.atomselect("protein")
|
336 |
+
resids = pd.DataFrame.from_dict({"resid": mol.resid, "is_prot": is_prot})
|
337 |
+
new_masses = []
|
338 |
i_without_non_prot = 0
|
339 |
+
for i, g in resids.groupby((resids["resid"].shift() != resids["resid"]).cumsum()):
|
340 |
+
if g["is_prot"].unique()[0] == True:
|
341 |
+
g["sasa"] = sasaR[i_without_non_prot]
|
342 |
+
i_without_non_prot += 1
|
343 |
else:
|
344 |
+
g["sasa"] = 0
|
345 |
new_masses.extend(list(g.sasa))
|
346 |
return np.array(new_masses)
|
347 |
|
348 |
+
|
349 |
def process_atomsel(atomsel):
|
350 |
"""everything lowercase and replace some keywords not relevant for protein design"""
|
351 |
+
atomsel = re.sub("sasa", "mass", atomsel, flags=re.I)
|
352 |
+
atomsel = re.sub("plddt", "beta", atomsel, flags=re.I)
|
353 |
return atomsel
|
354 |
|
355 |
|
356 |
def make_fixed_positions_dict(atomsel, residue_index_df):
|
357 |
# we use the uploaded file for the selection
|
358 |
+
mol = Molecule("original.pdb")
|
359 |
# use index for selection as resids will change
|
360 |
|
|
|
361 |
# set sasa to 0 for all non protein atoms (all non protein atoms are deleted later)
|
362 |
mol.masses = assign_sasa(mol)
|
363 |
print(mol.masses.shape)
|
364 |
print(assign_sasa(mol).shape)
|
365 |
atomsel = process_atomsel(atomsel)
|
366 |
+
selected_residues = mol.get("index", atomsel)
|
367 |
|
368 |
# clean up
|
369 |
mol.filter("protein")
|
370 |
mol.renumberResidues()
|
371 |
# based on selected index now get resids
|
372 |
selected_residues = [str(i) for i in selected_residues]
|
373 |
+
if len(selected_residues) == 0:
|
374 |
return None, []
|
375 |
selected_residues_str = " ".join(selected_residues)
|
376 |
+
selected_residues = set(mol.get("resid", sel=f"index {selected_residues_str}"))
|
377 |
|
378 |
# use the proteinMPNN index nomenclature to assemble fixed_positions_dict
|
379 |
+
fixed_positions_df = residue_index_df[
|
380 |
+
residue_index_df["new_resid"].isin(selected_residues)
|
381 |
+
]
|
382 |
|
383 |
+
chains = set(mol.get("chain", sel="all"))
|
384 |
+
fixed_position_dict = {"cleaned": {}}
|
385 |
+
# store the selected residues in a list for the visualization later with cleaned.pdb
|
386 |
+
selected_residues = list(fixed_positions_df["new_resid"])
|
387 |
|
388 |
for c in chains:
|
389 |
+
fixed_position_dict["cleaned"][c] = []
|
390 |
|
391 |
for i, row in fixed_positions_df.iterrows():
|
392 |
+
fixed_position_dict["cleaned"][row["chain"]].append(row["proteinMPNN_index"])
|
393 |
return fixed_position_dict, selected_residues
|
394 |
|
395 |
+
|
396 |
def update(
|
397 |
inp,
|
398 |
file,
|
|
|
403 |
sampling_temp,
|
404 |
model_name,
|
405 |
backbone_noise,
|
406 |
+
atomsel,
|
407 |
):
|
408 |
from protein_mpnn_utils import (
|
409 |
loss_nll,
|
|
|
476 |
if atomsel == "":
|
477 |
fixed_positions_dict, selected_residues = None, []
|
478 |
else:
|
479 |
+
fixed_positions_dict, selected_residues = make_fixed_positions_dict(
|
480 |
+
atomsel, mol_index
|
481 |
+
)
|
482 |
+
|
483 |
pssm_dict = None
|
484 |
omit_AA_dict = None
|
485 |
bias_AA_dict = None
|
|
|
496 |
tied_positions_dict = make_tied_positions_for_homomers(pdb_dict_list)
|
497 |
else:
|
498 |
tied_positions_dict = None
|
499 |
+
|
500 |
chain_id_dict = {}
|
501 |
chain_id_dict[pdb_dict_list[0]["name"]] = (designed_chain_list, fixed_chain_list)
|
502 |
with torch.no_grad():
|
|
|
745 |
)
|
746 |
seq_list.append(seq + chain_s)
|
747 |
message += f"{line}\n"
|
748 |
+
if fixed_positions_dict != None:
|
749 |
message += f"\nfixed positions:* {fixed_positions_dict['cleaned']} \n\n*uses CHAIN:[1..len(chain)] residue numbering"
|
750 |
# somehow sequences still contain X, remove again
|
751 |
for i, x in enumerate(seq_list):
|
|
|
776 |
)
|
777 |
|
778 |
fig_tadjusted.update_xaxes(side="top")
|
779 |
+
seq_dict = {"seq_list": seq_list, "recovery": seq_recovery, "seq_score": seq_score}
|
780 |
return (
|
781 |
message,
|
782 |
fig,
|
|
|
786 |
pdb_path,
|
787 |
gr.Dropdown.update(choices=seq_list),
|
788 |
selected_residues,
|
789 |
+
seq_dict,
|
790 |
)
|
791 |
|
792 |
|
793 |
+
def update_AF(seq_dict, pdb, num_recycles, selectedResidues):
|
794 |
|
795 |
# # run alphafold using ray
|
796 |
# plddts, pae, num_res = run_alphafold(
|
797 |
# startsequence, num_recycles
|
798 |
# )
|
799 |
+
allSeqs = seq_dict["seq_list"]
|
800 |
lenSeqs = len(allSeqs)
|
801 |
if len(allSeqs[0]) > 700:
|
802 |
return (
|
|
|
808 |
plt.figure(),
|
809 |
plt.figure(),
|
810 |
)
|
811 |
+
|
812 |
plddts, paes, num_res = ray.get(run_alphafold.remote(allSeqs, num_recycles))
|
813 |
|
814 |
sequences = {}
|
815 |
for i in range(lenSeqs):
|
816 |
+
rms, input_pdb, aligned_pdb = align_structures(
|
817 |
+
pdb, f"outputs/out_{i}.pdb", num_res, i
|
818 |
+
)
|
819 |
+
sequences[i] = {
|
820 |
+
"Seq": i,
|
821 |
+
"RMSD": f"{rms:.2f}",
|
822 |
+
"Score": seq_dict["seq_score"][i],
|
823 |
+
"Recovery": seq_dict["recovery"][i],
|
824 |
+
"Mean pLDDT": f"{np.mean(plddts[i]):.4f}",
|
825 |
+
}
|
826 |
+
results = pd.DataFrame.from_dict(sequences, orient="index")
|
827 |
print(results)
|
828 |
plots = []
|
829 |
for index, plddts_val in enumerate(plddts):
|
|
|
852 |
legend=dict(yanchor="bottom", y=0.01, xanchor="left", x=0.99),
|
853 |
)
|
854 |
pae_plots = []
|
855 |
+
for i, pae in enumerate(paes):
|
856 |
plt.figure()
|
857 |
plt.title(f"Predicted Aligned Error sequence {i}")
|
858 |
Ln = pae.shape[0]
|
|
|
873 |
# plotAF_pae.write_html("test.html")
|
874 |
# plotAF_pae.update_layout(title="Predicted Aligned Error", template="simple_white")
|
875 |
|
876 |
+
return (
|
877 |
+
molecule(
|
878 |
+
input_pdb,
|
879 |
+
aligned_pdb,
|
880 |
+
lenSeqs,
|
881 |
+
num_res,
|
882 |
+
selectedResidues,
|
883 |
+
allSeqs,
|
884 |
+
sequences,
|
885 |
+
),
|
886 |
+
plotAF_plddt,
|
887 |
+
pae_plots,
|
888 |
+
results,
|
889 |
+
)
|
890 |
|
891 |
|
892 |
def read_mol(molpath):
|
|
|
898 |
return mol
|
899 |
|
900 |
|
901 |
+
def molecule(
|
902 |
+
input_pdb, aligned_pdb, lenSeqs, num_res, selectedResidues, allSeqs, sequences
|
903 |
+
):
|
904 |
|
905 |
mol = read_mol("outputs/reference.pdb")
|
906 |
+
options = ""
|
907 |
pred_mol = "["
|
908 |
seqdata = "{"
|
909 |
selected = "selected"
|
910 |
for i in range(lenSeqs):
|
911 |
+
seqdata += (
|
912 |
+
str(i)
|
913 |
+
+ ': { "score": '
|
914 |
+
+ sequences[i]["Score"]
|
915 |
+
+ ', "rmsd": '
|
916 |
+
+ sequences[i]["RMSD"]
|
917 |
+
+ ', "recovery": '
|
918 |
+
+ sequences[i]["Recovery"]
|
919 |
+
+ ', "plddt": '
|
920 |
+
+ sequences[i]["Mean pLDDT"]
|
921 |
+
+ ', "seq":"'
|
922 |
+
+ allSeqs[i]
|
923 |
+
+ '"}'
|
924 |
+
)
|
925 |
+
options += f'<option {selected} value="{i}">sequence {i} </option>' # RMSD {sequences[i]["RMSD"]}, score {sequences[i]["Score"]}, recovery {sequences[i]["Recovery"]} pLDDT {sequences[i]["Mean pLDDT"]}
|
926 |
+
p = f"outputs/out_{i}_aligned.pdb"
|
927 |
+
pred_mol += f"`{read_mol(p)}`"
|
928 |
selected = ""
|
929 |
+
if i != lenSeqs - 1:
|
930 |
+
pred_mol += ","
|
931 |
+
seqdata += ","
|
932 |
+
pred_mol += "]"
|
933 |
+
seqdata += "}"
|
|
|
|
|
934 |
|
935 |
x = (
|
936 |
"""<!DOCTYPE html>
|
|
|
975 |
a sequence</label>
|
976 |
<select id="seq"
|
977 |
class="bg-gray-50 border border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2.5 dark:bg-gray-700 dark:border-gray-600 dark:placeholder-gray-400 dark:text-white dark:focus:ring-blue-500 dark:focus:border-blue-500">
|
978 |
+
"""
|
979 |
+
+ options
|
980 |
+
+ """
|
981 |
</select>
|
982 |
</div>
|
983 |
<div class="font-mono bg-gray-100 py-3 px-2 font-sm rounded">
|
|
|
1076 |
let element = null;
|
1077 |
let config = null;
|
1078 |
let currentIndex = 0;
|
1079 |
+
let seqs = """
|
1080 |
+
+ seqdata
|
1081 |
+
+ """
|
1082 |
let data = """
|
1083 |
+ pred_mol
|
1084 |
+ """
|
1085 |
let pdb = `"""
|
1086 |
+ mol
|
1087 |
+ """`
|
1088 |
+
var selectedResidues = """
|
1089 |
+
+ f"{selectedResidues}"
|
1090 |
+
+ """
|
1091 |
//AlphaFold code from https://gist.github.com/piroyon/30d1c1099ad488a7952c3b21a5bebc96
|
1092 |
let colorAlpha = function (atom) {
|
1093 |
if (atom.b < 50) {
|
|
|
1102 |
};
|
1103 |
|
1104 |
let colors = {}
|
1105 |
+
for (let i=0; i<"""
|
1106 |
+
+ str(num_res)
|
1107 |
+
+ """;i++){
|
1108 |
if (selectedResidues.includes(i)){
|
1109 |
colors[i]="hotpink"
|
1110 |
}else{
|
|
|
1199 |
|
1200 |
|
1201 |
def set_examples(example):
|
1202 |
+
(
|
1203 |
+
label,
|
1204 |
+
inp,
|
1205 |
+
designed_chain,
|
1206 |
+
fixed_chain,
|
1207 |
+
homomer,
|
1208 |
+
num_seqs,
|
1209 |
+
sampling_temp,
|
1210 |
+
atomsel,
|
1211 |
+
) = example
|
1212 |
return [
|
1213 |
label,
|
1214 |
inp,
|
|
|
1217 |
homomer,
|
1218 |
gr.Slider.update(value=num_seqs),
|
1219 |
gr.Radio.update(value=sampling_temp),
|
1220 |
+
atomsel,
|
1221 |
]
|
1222 |
|
1223 |
|
|
|
1254 |
value=0.1,
|
1255 |
label="Sampling temperature",
|
1256 |
)
|
1257 |
+
gr.Markdown(
|
1258 |
+
""" Sampling temperature for amino acids, `T=0.0` means taking argmax, `T>>1.0` means sample randomly. Suggested values `0.1, 0.15, 0.2, 0.25, 0.3`. Higher values will lead to more diversity.
|
1259 |
"""
|
1260 |
)
|
1261 |
with gr.Row():
|
|
|
1277 |
gr.Markdown(
|
1278 |
"for correct symmetric tying lenghts of homomer chains should be the same"
|
1279 |
)
|
1280 |
+
gr.Markdown("## Fixed positions")
|
1281 |
+
gr.Markdown(
|
1282 |
+
"""You can fix important positions in the protein. Resid should be specified with the same numbering as in the input pdb file. The fixed residues will be highlighted in the output.
|
1283 |
The [VMD selection](http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html) synthax is used. You can also select based on ligands or chains in the input structure to specify interfaces to be fixed.
|
1284 |
|
1285 |
- <code>within 5 of resid 94</code> All residues that have >1 atom closer than 5 Å to any atom of residue 94
|
|
|
1293 |
- <code>pLDDT >70 </code> Redesign all residues with low pLDDT
|
1294 |
|
1295 |
Note that <code>sasa</code> and <code>pLDDT</code> selectors modify default VMD behavior. SASA is calculated using moleculekit and written to the mass attribute. Selections based on mass do not work.
|
1296 |
+
pLDDT is an alias for beta, it only works correctly with structures that contain the appropriate values in the beta column of the PDB file. """
|
1297 |
+
)
|
1298 |
+
atomsel = gr.Textbox(
|
1299 |
+
placeholder="Specify atom selection ", label="Fixed positions"
|
1300 |
+
)
|
1301 |
|
1302 |
btn = gr.Button("Run")
|
1303 |
label = gr.Textbox(label="Label", visible=False)
|
|
|
1310 |
homomer,
|
1311 |
num_seqs,
|
1312 |
sampling_temp,
|
1313 |
+
atomsel,
|
1314 |
],
|
1315 |
samples=[
|
1316 |
+
["Homomer design", "1O91", "A,B,C", "", True, 2, 0.1, ""],
|
1317 |
+
["Monomer design", "6MRR", "A", "", False, 2, 0.1, ""],
|
1318 |
["Redesign of Homomer to Heteromer", "3HTN", "A,B", "C", False, 2, 0.1, ""],
|
1319 |
+
[
|
1320 |
+
"Redesign of MID1 scaffold keeping binding site fixed",
|
1321 |
+
"3V1C",
|
1322 |
+
"A,B",
|
1323 |
+
"",
|
1324 |
+
False,
|
1325 |
+
2,
|
1326 |
+
0.1,
|
1327 |
+
"within 5 of resname ZN",
|
1328 |
+
],
|
1329 |
+
[
|
1330 |
+
"Redesign of DNA binding protein",
|
1331 |
+
"3JRD",
|
1332 |
+
"A,B",
|
1333 |
+
"",
|
1334 |
+
False,
|
1335 |
+
2,
|
1336 |
+
0.1,
|
1337 |
+
"within 8 of nucleic",
|
1338 |
+
],
|
1339 |
+
[
|
1340 |
+
"Surface Redesign of miniprotein",
|
1341 |
+
"7JZM",
|
1342 |
+
"A,B",
|
1343 |
+
"",
|
1344 |
+
False,
|
1345 |
+
2,
|
1346 |
+
0.1,
|
1347 |
+
"chain B or (chain A and sasa < 20)",
|
1348 |
+
],
|
1349 |
],
|
1350 |
)
|
1351 |
|
|
|
1352 |
gr.Markdown("# Output")
|
1353 |
|
1354 |
with gr.Tabs():
|
|
|
1383 |
with gr.Row():
|
1384 |
with gr.Row():
|
1385 |
chosen_seq = gr.Dropdown(
|
1386 |
+
choices=[],
|
1387 |
+
label="Select a sequence for validation",
|
1388 |
+
visible=False,
|
1389 |
)
|
1390 |
num_recycles = gr.Dropdown(
|
1391 |
choices=[0, 1, 3, 5], value=3, label="num Recycles"
|
|
|
1395 |
mol = gr.HTML()
|
1396 |
with gr.Column():
|
1397 |
gr.Markdown("## Metrics")
|
1398 |
+
p = {
|
1399 |
+
0: {
|
1400 |
+
"Seq": "NA",
|
1401 |
+
"RMSD": "NA",
|
1402 |
+
"Score": "NA",
|
1403 |
+
"Recovery": "NA",
|
1404 |
+
"Mean pLDDT": "NA",
|
1405 |
+
}
|
1406 |
+
}
|
1407 |
+
placeholder = pd.DataFrame.from_dict(p, orient="index")
|
1408 |
+
results = gr.Dataframe(
|
1409 |
+
placeholder,
|
1410 |
+
interactive=False,
|
1411 |
+
row_count=(1, "dynamic"),
|
1412 |
+
headers=["Seq", "RMSD", "Score", "Recovery", "Mean pLDDT"],
|
1413 |
+
)
|
1414 |
plotAF_plddt = gr.Plot(label="pLDDT")
|
1415 |
# remove maxh80 class from css
|
1416 |
+
plotAF_pae = gr.Gallery(label="PAE plots") # gr.Plot(label="PAE")
|
1417 |
tempFile = gr.Variable()
|
1418 |
selectedResidues = gr.Variable()
|
1419 |
seq_dict = gr.Variable()
|
|
|
1440 |
tempFile,
|
1441 |
chosen_seq,
|
1442 |
selectedResidues,
|
1443 |
+
seq_dict,
|
1444 |
],
|
1445 |
)
|
1446 |
btnAF.click(
|
requirements.txt
CHANGED
@@ -6,8 +6,8 @@ dm-haiku==0.0.5
|
|
6 |
dm-tree==0.1.6
|
7 |
docker==5.0.0
|
8 |
immutabledict==2.0.0
|
9 |
-
|
10 |
-
|
11 |
ml-collections==0.1.0
|
12 |
numpy
|
13 |
pandas==1.3.4
|
@@ -18,7 +18,7 @@ plotly
|
|
18 |
GPUtil
|
19 |
ray
|
20 |
tqdm
|
21 |
-
gradio==3.
|
22 |
protobuf<4
|
23 |
mdtraj
|
24 |
-f https://storage.googleapis.com/jax-releases/jax_releases.html
|
|
|
6 |
dm-tree==0.1.6
|
7 |
docker==5.0.0
|
8 |
immutabledict==2.0.0
|
9 |
+
jax==0.3.7
|
10 |
+
jaxlib==0.3.7
|
11 |
ml-collections==0.1.0
|
12 |
numpy
|
13 |
pandas==1.3.4
|
|
|
18 |
GPUtil
|
19 |
ray
|
20 |
tqdm
|
21 |
+
gradio==3.1.4
|
22 |
protobuf<4
|
23 |
mdtraj
|
24 |
-f https://storage.googleapis.com/jax-releases/jax_releases.html
|