File size: 10,536 Bytes
02970c0
 
db541e4
02970c0
4d5beeb
f73076c
4d5beeb
ed36f4f
02970c0
2ec65d5
8505e9d
 
db541e4
4d5beeb
 
c72f5fe
7f5c48e
 
 
8505e9d
7f5c48e
 
 
 
 
 
 
 
 
 
 
 
 
 
8505e9d
7f5c48e
4d5beeb
7f5c48e
affd796
2ec65d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2466cb5
7f5c48e
 
 
d0c2974
7f5c48e
 
 
 
d0c2974
7f5c48e
2466cb5
 
 
7f5c48e
2466cb5
 
c26a162
7f5c48e
d0c2974
7f5c48e
 
 
d0c2974
7f5c48e
2466cb5
 
 
7f5c48e
2466cb5
 
c26a162
ed36f4f
 
 
 
 
d0c2974
ed36f4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d0c2974
 
ed36f4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02970c0
4d5beeb
 
 
1082445
b6b61f6
a046ca2
4d5beeb
7f5c48e
 
 
 
4d5beeb
 
 
 
 
 
 
 
 
d0c2974
2466cb5
a046ca2
4d5beeb
7f5c48e
 
 
 
d92a3e6
4d5beeb
 
 
affd796
ed36f4f
 
 
 
d0c2974
 
ed36f4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d0c2974
ed36f4f
 
 
 
 
 
 
 
 
 
affd796
 
2f14da2
 
 
 
f73076c
 
 
 
ed36f4f
2f14da2
 
affd796
ed36f4f
 
 
 
 
4d5beeb
 
db541e4
affd796
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import re
from pathlib import Path
import gradio as gr

from evodiff.pretrained import OA_DM_38M, D3PM_UNIFORM_38M, MSA_OA_DM_MAXSUB
from evodiff.generate import generate_oaardm, generate_d3pm
from evodiff.generate_msa import generate_query_oadm_msa_simple
from evodiff.conditional_generation import inpaint_simple, generate_scaffold

import py3Dmol
from colabfold.download import download_alphafold_params
from colabfold.batch import run

def a3m_file(file):
    """Return the fixed A3M file name ``tmp.a3m``.

    The ``file`` argument is accepted but not inspected; the same name is
    returned for every input.
    """
    a3m_name = "tmp.a3m"
    return a3m_name

def predict_protein(sequence):
    """Predict a structure for ``sequence`` with ColabFold and return the PDB path.

    The AlphaFold2 parameters are requested into the current directory via
    ``download_alphafold_params`` and ColabFold's ``run`` is then called for
    a single query with the settings visible below (one model, one recycle,
    no templates, no relaxation, ``mmseqs2_uniref_env`` MSA mode).

    Args:
        sequence: The sequence to fold; passed through to ColabFold unchanged.

    Returns:
        The path (as a string) at which the rank-1 unrelaxed model is
        expected. The path is built from the job/model names and is not
        checked for existence here.
    """
    # One definition for each name keeps the query, the result directory and
    # the returned path in sync.
    job_name = "evodiff_protein"
    model_type = "alphafold2_ptm"
    params_dir = Path(".")

    download_alphafold_params(model_type, params_dir)

    # The return value of run() is not needed; only the files it writes are used.
    run(
        queries=[(job_name, sequence, None)],
        result_dir=job_name,
        use_templates=False,
        num_relax=0,
        msa_mode="mmseqs2_uniref_env",
        model_type=model_type,
        num_models=1,
        num_recycles=1,
        model_order=[1],
        is_complex=False,
        data_dir=params_dir,
        keep_existing_results=False,
        rank_by="auto",
        stop_at_score=float(100),
        zip_results=False,
        user_agent="colabfold/google-colab-main",
    )

    return f"{job_name}/{job_name}_unrelaxed_rank_001_{model_type}_model_1_seed_000.pdb"

def display_pdb(path_to_pdb):
    """Render a PDB file as an HTML ``<iframe>`` snippet using py3Dmol.

    SOURCE: https://huggingface.co/spaces/merle/PROTEIN_GENERATOR/blob/main/app.py

    Args:
        path_to_pdb: Path of the PDB text file to display.

    Returns:
        An HTML string containing an ``<iframe>`` whose ``srcdoc`` attribute
        holds the py3Dmol viewer markup (500x500 view, cartoon style coloured
        by the ``b`` property with the ``roygb`` gradient between 0 and 1).
    """
    # read_text() opens and closes the file, so no handle is left dangling.
    pdb = Path(path_to_pdb).read_text()

    view = py3Dmol.view(width=500, height=500)
    view.addModel(pdb, "pdb")
    view.setStyle(
        {'model': -1},
        {"cartoon": {'colorscheme': {'prop': 'b', 'gradient': 'roygb', 'min': 0, 'max': 1}}},
    )
    view.zoomTo()

    # The viewer markup is embedded in a single-quoted srcdoc attribute below,
    # so every ' in it is swapped for " to keep the attribute intact.
    output = view._make_html().replace("'", '"')

    # NOTE(review): the first tag is written as </center> (a closing tag);
    # confirm whether an opening <center> was intended.
    x = f"""<!DOCTYPE html><html></center> {output} </center></html>"""  # do not use ' in this input

    return f"""<iframe height="500px" width="100%"  name="result" allow="midi; geolocation; microphone; camera;
                            display-capture; encrypted-media;" sandbox="allow-modals allow-forms
                            allow-scripts allow-same-origin allow-popups
                            allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
                            allowpaymentrequest="" frameborder="0" srcdoc='{x}'></iframe>"""

'''
    return f"""<iframe  style="width: 100%; height:700px" name="result" allow="midi; geolocation; microphone; camera; 
                            display-capture; encrypted-media;" sandbox="allow-modals allow-forms 
                            allow-scripts allow-same-origin allow-popups 
                            allow-top-navigation-by-user-activation allow-downloads" allowfullscreen="" 
                            allowpaymentrequest="" frameborder="0" srcdoc='{x}'></iframe>"""
'''

def make_uncond_seq(seq_len, model_type, pred_structure):
    """Generate a sequence unconditionally with the selected EvoDiff model.

    Args:
        seq_len: Requested sequence length; converted with ``int()``.
        model_type: ``"EvoDiff-Seq-OADM 38M"`` or ``"EvoDiff-D3PM-Uniform 38M"``.
        pred_structure: When truthy, the generated sequence is also passed to
            ``predict_protein`` and rendered with ``display_pdb``.

    Returns:
        A ``(generated_sequence, html)`` tuple, where ``generated_sequence``
        is the second value returned by the evodiff generator and ``html`` is
        the viewer markup, or ``None`` when ``pred_structure`` is falsy.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    length = int(seq_len)

    if model_type == "EvoDiff-Seq-OADM 38M":
        model, _collater, tokenizer, _scheme = OA_DM_38M()
        _tokenized, generated_sequence = generate_oaardm(
            model, tokenizer, length, batch_size=1, device='cpu'
        )
    elif model_type == "EvoDiff-D3PM-Uniform 38M":
        model, _collater, tokenizer, _scheme, timestep, Q_bar, Q = D3PM_UNIFORM_38M(return_all=True)
        _tokenized, generated_sequence = generate_d3pm(
            model, tokenizer, Q, Q_bar, timestep, length, batch_size=1, device='cpu'
        )
    else:
        # Without this branch an unknown name fell through to an
        # UnboundLocalError on generated_sequence further down.
        raise ValueError(f"Unsupported model type: {model_type!r}")

    if not pred_structure:
        return generated_sequence, None

    path_to_pdb = predict_protein(generated_sequence)
    return generated_sequence, display_pdb(path_to_pdb)

def make_cond_seq(seq_len, msa_file, n_sequences, model_type, pred_structure):
    """Generate a sequence conditioned on an uploaded MSA with EvoDiff-MSA.

    Args:
        seq_len: Requested sequence length; converted with ``int()``.
        msa_file: Uploaded file object; its ``.name`` attribute is passed to
            evodiff as the MSA path.
        n_sequences: Number of MSA sequences to use; converted with ``int()``.
        model_type: Must be ``"EvoDiff-MSA"``.
        pred_structure: When truthy, the generated sequence is also passed to
            ``predict_protein`` and rendered with ``display_pdb``.

    Returns:
        A ``(generated_sequence, html)`` tuple; ``html`` is ``None`` when
        ``pred_structure`` is falsy.

    Raises:
        ValueError: If ``model_type`` is unsupported or no MSA file was given.
    """
    # Validate the cheap inputs before loading the checkpoint.
    if model_type != "EvoDiff-MSA":
        raise ValueError(f"Unsupported model type: {model_type!r}")
    if msa_file is None:
        raise ValueError("An MSA file (.a3m) is required for conditional generation.")

    model, _collater, tokenizer, _scheme = MSA_OA_DM_MAXSUB()
    _tokenized, generated_sequence = generate_query_oadm_msa_simple(
        msa_file.name,
        model,
        tokenizer,
        int(n_sequences),
        seq_length=int(seq_len),
        device='cpu',
        selection_type='random',
    )

    if not pred_structure:
        return generated_sequence, None

    path_to_pdb = predict_protein(generated_sequence)
    return generated_sequence, display_pdb(path_to_pdb)
    
def make_inpainted_idrs(sequence, start_idx, end_idx, model_type, pred_structure):
    """Inpaint the region ``[start_idx:end_idx]`` of ``sequence`` with EvoDiff-Seq.

    Args:
        sequence: The input sequence string.
        start_idx: Start of the region to regenerate; converted with ``int()``.
        end_idx: End of the region to regenerate; converted with ``int()``.
        model_type: Must be ``"EvoDiff-Seq"``.
        pred_structure: When truthy, the full generated sequence is also
            passed to ``predict_protein`` and rendered with ``display_pdb``.

    Returns:
        A ``(result, html)`` tuple. ``result`` is a dict with the keys
        ``original_sequence``, ``generated_sequence``, ``original_region`` and
        ``generated_region``; ``html`` is ``None`` when ``pred_structure`` is
        falsy.

    Raises:
        ValueError: If ``model_type`` is not ``"EvoDiff-Seq"``.
    """
    if model_type != "EvoDiff-Seq":
        raise ValueError(f"Unsupported model type: {model_type!r}")

    # Convert once so the model call and the slice below use the same bounds
    # (the slice previously used the raw, unconverted inputs).
    start, end = int(start_idx), int(end_idx)

    model, _collater, tokenizer, _scheme = OA_DM_38M()
    _sample, entire_sequence, generated_idr = inpaint_simple(
        model, sequence, start, end, tokenizer=tokenizer, device='cpu'
    )

    generated_idr_output = {
        "original_sequence": sequence,
        "generated_sequence": entire_sequence,
        "original_region": sequence[start:end],
        "generated_region": generated_idr,
    }

    if not pred_structure:
        return generated_idr_output, None

    path_to_pdb = predict_protein(entire_sequence)
    return generated_idr_output, display_pdb(path_to_pdb)
    
def _parse_index_list(text):
    """Parse a bracketed, comma-separated list such as ``"[15, 51]"`` into ints."""
    # int() ignores surrounding whitespace, so "[15,51]" and "[15, 51]" both work.
    return [int(token) for token in text.strip().strip('][').split(',')]


def make_scaffold_motifs(pdb_code, start_idx, end_idx, scaffold_length, model_type, pred_structure):
    """Scaffold motif(s) taken from a PDB entry with EvoDiff-Seq.

    Args:
        pdb_code: PDB identifier passed to ``generate_scaffold``; also used to
            locate ``scaffolding-pdbs/{pdb_code}.pdb`` for display.
        start_idx: Motif start indices as text, e.g. ``"[15, 51]"``.
        end_idx: Motif end indices as text, e.g. ``"[34, 70]"``.
        scaffold_length: Scaffold length forwarded to ``generate_scaffold``.
        model_type: Must be ``"EvoDiff-Seq"``; no other model is implemented
            here.
        pred_structure: When truthy, the PDB file under ``scaffolding-pdbs/``
            is rendered with ``display_pdb`` (no structure prediction is run
            for the generated sequence).

    Returns:
        A ``(result, html)`` tuple. ``result`` is a dict with the keys
        ``generated_sequence``, ``new_start_index`` and ``new_end_index``;
        ``html`` is ``None`` when ``pred_structure`` is falsy.

    Raises:
        ValueError: If ``model_type`` is unsupported or an index list cannot
            be parsed as integers.
    """
    if model_type != "EvoDiff-Seq":
        # Any other choice previously surfaced as an UnboundLocalError.
        raise ValueError(
            f"Unsupported model type for scaffolding: {model_type!r} (only 'EvoDiff-Seq' is implemented)"
        )

    motif_starts = _parse_index_list(start_idx)
    motif_ends = _parse_index_list(end_idx)

    model, _collater, tokenizer, _scheme = OA_DM_38M()
    data_top_dir = './'
    generated_sequence, new_start_idx, new_end_idx = generate_scaffold(
        model, pdb_code, motif_starts, motif_ends, scaffold_length, data_top_dir, tokenizer, device='cpu'
    )

    generated_scaffold_output = {
        "generated_sequence": generated_sequence,
        "new_start_index": new_start_idx,
        "new_end_index": new_end_idx,
    }

    if not pred_structure:
        return generated_scaffold_output, None

    path_to_pdb = f"scaffolding-pdbs/{pdb_code}.pdb"
    return generated_scaffold_output, display_pdb(path_to_pdb)

# Tab: unconditional generation. Inputs map positionally onto
# make_uncond_seq(seq_len, model_type, pred_structure); the structure
# checkbox is hidden and defaults to False.
usg_app = gr.Interface(
    fn=make_uncond_seq,
    inputs=[
        gr.Slider(10, 100, step=1, label="Sequence Length"),
        gr.Dropdown(
            ["EvoDiff-Seq-OADM 38M", "EvoDiff-D3PM-Uniform 38M"],
            value="EvoDiff-Seq-OADM 38M",
            type="value",
            label="Model",
        ),
        gr.Checkbox(value=False, label="Predict Structure?", visible=False),
    ],
    outputs=["text", gr.HTML()],
    title="Unconditional sequence generation",
    description="Generate a sequence with `EvoDiff-Seq-OADM 38M` (smaller/faster) or `EvoDiff-D3PM-Uniform 38M` (larger/slower) models.",
)

# Tab: MSA-conditioned generation. Inputs map positionally onto
# make_cond_seq(seq_len, msa_file, n_sequences, model_type, pred_structure);
# the structure checkbox is hidden and defaults to False.
csg_app = gr.Interface(
    fn=make_cond_seq,
    inputs=[
        # step=1 matches the unconditional tab and keeps the length integral.
        gr.Slider(10, 100, step=1, label="Sequence Length"),
        # Gradio expects file extensions with a leading dot.
        gr.File(file_types=[".a3m"], label="MSA File"),
        gr.Number(value=1, placeholder=1, precision=0, label="Number of Sequences"),
        gr.Dropdown(["EvoDiff-MSA"], value="EvoDiff-MSA", type="value", label="Model"),
        gr.Checkbox(value=False, label="Predict Structure?", visible=False),
    ],
    outputs=["text", gr.HTML()],
    # examples=[["https://github.com/microsoft/evodiff/raw/main/examples/example_files/bfd_uniclust_hits.a3m"]],
    title="Conditional sequence generation",
    description="Evolutionary guided sequence generation with the `EvoDiff-MSA` model.",
)

# Tab: IDR inpainting. Inputs map positionally onto
# make_inpainted_idrs(sequence, start_idx, end_idx, model_type, pred_structure);
# the structure checkbox is hidden and defaults to False.
idr_app = gr.Interface(
    fn=make_inpainted_idrs,
    inputs=[
        gr.Textbox(
            placeholder="DQTERTVRSFEGRRTAPYLDSRNVLTIGYGHLLNRPGANKSWEGRLTSALPREFKQRLTELAASQLHETDVRLATARAQALYGSGAYFESVPVSLNDLWFDSVFNLGERKLLNWSGLRTKLESRDWGAAAKDLGRHTFGREPVSRRMAESMRMRRGIDLNHYNI",
            label="Sequence",
        ),
        gr.Number(value=20, placeholder=20, precision=0, label="Start Index"),
        gr.Number(value=50, placeholder=50, precision=0, label="End Index"),
        gr.Dropdown(["EvoDiff-Seq"], value="EvoDiff-Seq", type="value", label="Model"),
        gr.Checkbox(value=False, label="Predict Structure?", visible=False),
    ],
    outputs=["text", gr.HTML()],
    title="Inpainting IDRs",
    description="Inpainting a new region inside a given sequence using the `EvoDiff-Seq` model.",
)

# Tab: motif scaffolding. Inputs map positionally onto
# make_scaffold_motifs(pdb_code, start_idx, end_idx, scaffold_length,
# model_type, pred_structure); the structure checkbox is hidden and defaults
# to False.
scaffold_app = gr.Interface(
    fn=make_scaffold_motifs,
    inputs=[
        gr.Textbox(placeholder="1prw", label="PDB Code"),
        gr.Textbox(value="[15, 51]", placeholder="[15, 51]", label="Start Index (as list)"),
        gr.Textbox(value="[34, 70]", placeholder="[34, 70]", label="End Index (as list)"),
        gr.Number(value=75, placeholder=75, precision=0, label="Scaffold Length"),
        # Only the model the handler implements is offered; "EvoDiff-MSA" has
        # no branch in make_scaffold_motifs.
        gr.Dropdown(["EvoDiff-Seq"], value="EvoDiff-Seq", type="value", label="Model"),
        gr.Checkbox(value=False, label="Predict Structure?", visible=False),
    ],
    outputs=["text", gr.HTML()],
    title="Scaffolding functional motifs",
    description="Scaffolding a new functional motif inside a given PDB structure using the `EvoDiff-Seq` model.",
)

# Top-level page: a Markdown header row followed by the four interfaces as tabs.
with gr.Blocks() as edapp:
    with gr.Row():
        gr.Markdown(
            """
            # EvoDiff
            ## Generation of protein sequences and evolutionary alignments via discrete diffusion models

            Created By: Microsoft Research [Sarah Alamdari, Nitya Thakkar, Rianne van den Berg, Alex X. Lu, Nicolo Fusi, Ava P. Amini, and Kevin K. Yang]
            
            Spaces App By: Tuple, The Cloud Genomics Company [Colby T. Ford]
            """
        )
    with gr.Row():
        # Tab titles are given in the same order as the interfaces.
        gr.TabbedInterface([usg_app, csg_app, idr_app, scaffold_app],
                           ["Unconditional sequence generation",
                            "Conditional generation",
                            "Inpainting IDRs",
                            "Scaffolding functional motifs"])



# Start the Gradio server only when the file is executed as a script.
if __name__ == "__main__":
    edapp.launch()