File size: 6,728 Bytes
76f2fb3
02970c0
db541e4
02970c0
4d5beeb
f73076c
4d5beeb
ed36f4f
02970c0
db541e4
6bc4f0e
7f5c48e
 
 
d0c2974
7f5c48e
 
 
 
d0c2974
7f5c48e
6bc4f0e
7f5c48e
6bc4f0e
7f5c48e
 
 
76f2fb3
d0c2974
7f5c48e
6bc4f0e
ed36f4f
6bc4f0e
ed36f4f
 
 
d0c2974
ed36f4f
 
 
 
 
 
 
 
6bc4f0e
ed36f4f
6bc4f0e
ed36f4f
 
 
4455bd5
6b700d7
d14afa6
c7cc63a
 
ed36f4f
 
 
 
 
 
 
 
6bc4f0e
02970c0
4d5beeb
 
 
6bc4f0e
 
4d5beeb
76f2fb3
4d5beeb
 
 
 
 
 
 
6bc4f0e
4d5beeb
ff4e1a8
6bc4f0e
4d5beeb
76f2fb3
d92a3e6
4d5beeb
 
 
affd796
ed36f4f
 
 
76f2fb3
 
ff4e1a8
 
6bc4f0e
ed36f4f
76f2fb3
ed36f4f
 
 
 
 
 
 
ff4e1a8
 
 
 
6bc4f0e
ed36f4f
76f2fb3
ed36f4f
 
 
affd796
 
2f14da2
 
 
 
f73076c
 
 
 
ed36f4f
2f14da2
 
affd796
4f972b8
 
 
 
 
 
 
 
 
 
 
 
4d5beeb
 
db541e4
affd796
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import re, os
from pathlib import Path
import gradio as gr

from evodiff.pretrained import OA_DM_38M, D3PM_UNIFORM_38M, MSA_OA_DM_MAXSUB
from evodiff.generate import generate_oaardm, generate_d3pm
from evodiff.generate_msa import generate_query_oadm_msa_simple
from evodiff.conditional_generation import inpaint_simple, generate_scaffold


def make_uncond_seq(seq_len, model_type):
    """Generate a single sequence unconditionally on CPU.

    Args:
        seq_len: Requested sequence length; converted with ``int()`` before
            being handed to the generator.
        model_type: Either ``"EvoDiff-Seq-OADM 38M"`` or
            ``"EvoDiff-D3PM-Uniform 38M"``.

    Returns:
        The second element of the tuple returned by the selected evodiff
        generator (called with ``batch_size=1``).

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
            Previously this fell through to an ``UnboundLocalError``.
    """
    length = int(seq_len)

    if model_type == "EvoDiff-Seq-OADM 38M":
        model, _collater, tokenizer, _scheme = OA_DM_38M()
        _tokenized, generated_sequence = generate_oaardm(
            model, tokenizer, length, batch_size=1, device='cpu'
        )
    elif model_type == "EvoDiff-D3PM-Uniform 38M":
        # return_all=True additionally yields the timestep count and the
        # Q / Q_bar objects that generate_d3pm takes as arguments.
        (model, _collater, tokenizer, _scheme,
         timestep, Q_bar, Q) = D3PM_UNIFORM_38M(return_all=True)
        _tokenized, generated_sequence = generate_d3pm(
            model, tokenizer, Q, Q_bar, timestep, length,
            batch_size=1, device='cpu'
        )
    else:
        raise ValueError(
            f"Unsupported model type for unconditional generation: {model_type!r}"
        )

    return generated_sequence

def make_cond_seq(seq_len, msa_file, n_sequences, model_type):
    """Generate a query sequence conditioned on an uploaded MSA, on CPU.

    Args:
        seq_len: Requested sequence length; converted with ``int()``.
        msa_file: The uploaded MSA. Either an object exposing the file path
            as ``.name`` or a plain path string is accepted.
        n_sequences: Number of MSA sequences to sample; converted with
            ``int()``.
        model_type: Must be ``"EvoDiff-MSA"``.

    Returns:
        The second element of the tuple returned by
        ``generate_query_oadm_msa_simple``.

    Raises:
        ValueError: If ``model_type`` is unsupported or no MSA file was
            supplied. Previously these surfaced as ``UnboundLocalError`` /
            ``AttributeError``.
    """
    if model_type != "EvoDiff-MSA":
        raise ValueError(
            f"Unsupported model type for conditional generation: {model_type!r}"
        )
    if msa_file is None:
        raise ValueError("An MSA file (.a3m) is required for conditional generation.")

    # NOTE(review): depending on the Gradio version/config the File input may
    # arrive as a path string instead of a tempfile-like object — handle both.
    msa_path = msa_file if isinstance(msa_file, str) else msa_file.name

    model, _collater, tokenizer, _scheme = MSA_OA_DM_MAXSUB()
    print(f"MSA File Path: {msa_path}")
    _tokenized, generated_sequence = generate_query_oadm_msa_simple(
        msa_path, model, tokenizer, int(n_sequences),
        seq_length=int(seq_len), device='cpu', selection_type='random'
    )

    return generated_sequence
    
def make_inpainted_idrs(sequence, start_idx, end_idx, model_type):
    """Inpaint the region ``[start_idx, end_idx)`` of ``sequence`` on CPU.

    Args:
        sequence: The input sequence string.
        start_idx: Start of the region; converted with ``int()``.
        end_idx: End of the region; converted with ``int()``.
        model_type: Must be ``"EvoDiff-Seq"``.

    Returns:
        A dict with keys ``original_sequence``, ``generated_sequence``,
        ``original_region`` (the slice ``sequence[start:end]``) and
        ``generated_region``.

    Raises:
        ValueError: If ``model_type`` is unsupported. Previously this fell
            through to an ``UnboundLocalError``.
    """
    if model_type != "EvoDiff-Seq":
        raise ValueError(
            f"Unsupported model type for IDR inpainting: {model_type!r}"
        )

    # Convert once so the model call and the slice below use the same integer
    # bounds (slicing with a float index would raise TypeError).
    start, end = int(start_idx), int(end_idx)

    model, _collater, tokenizer, _scheme = OA_DM_38M()
    _sample, entire_sequence, generated_idr = inpaint_simple(
        model, sequence, start, end, tokenizer=tokenizer, device='cpu'
    )

    return {
        "original_sequence": sequence,
        "generated_sequence": entire_sequence,
        "original_region": sequence[start:end],
        "generated_region": generated_idr
    }
    
def _parse_index_list(text):
    """Parse a bracketed, comma-separated string such as ``"[15, 51]"`` into ints."""
    tokens = text.strip().strip('][').split(',')
    # Skip empty tokens so "[]" or a trailing comma does not hit int('').
    return [int(token) for token in tokens if token.strip()]


def make_scaffold_motifs(pdb_code, start_idx, end_idx, scaffold_length, model_type):
    """Generate a scaffold around motif(s) taken from a PDB entry, on CPU.

    Args:
        pdb_code: PDB identifier passed through to ``generate_scaffold``.
        start_idx: Motif start indices as a string such as ``"[15, 51]"``.
        end_idx: Motif end indices as a string such as ``"[34, 70]"``.
        scaffold_length: Scaffold length; converted with ``int()``.
        model_type: Must be ``"EvoDiff-Seq"``.

    Returns:
        A dict with keys ``generated_sequence``, ``new_start_index`` and
        ``new_end_index``, filled from the ``generate_scaffold`` result.

    Raises:
        ValueError: If ``model_type`` is unsupported (previously an
            ``UnboundLocalError``) or an index token is not an integer.
    """
    if model_type != "EvoDiff-Seq":
        raise ValueError(
            f"Unsupported model type for motif scaffolding: {model_type!r}"
        )

    model, _collater, tokenizer, _scheme = OA_DM_38M()

    # Directory handed to generate_scaffold as its data directory; created up
    # front so it exists before the call.
    data_top_dir = '/home/user/.cache/huggingface/datasets/'
    os.makedirs(data_top_dir, exist_ok=True)

    motif_starts = _parse_index_list(start_idx)
    motif_ends = _parse_index_list(end_idx)

    generated_sequence, new_start_idx, new_end_idx = generate_scaffold(
        model, pdb_code, motif_starts, motif_ends, int(scaffold_length),
        data_top_dir, tokenizer, device='cpu'
    )

    return {
        "generated_sequence": generated_sequence,
        "new_start_index": new_start_idx,
        "new_end_index": new_end_idx
    }

# Unconditional generation tab: a length slider and a model picker are passed
# to make_uncond_seq, and the return value is rendered as text.
usg_app = gr.Interface(
    fn=make_uncond_seq,
    inputs=[
        gr.Slider(10, 250, step=1, label="Sequence Length"),
        gr.Dropdown(
            ["EvoDiff-Seq-OADM 38M", "EvoDiff-D3PM-Uniform 38M"],
            value="EvoDiff-Seq-OADM 38M",
            type="value",
            label="Model",
        ),
    ],
    outputs=["text"],
    title="Unconditional sequence generation",
    description="Generate a sequence with `EvoDiff-Seq-OADM 38M` (smaller/faster) or `EvoDiff-D3PM-Uniform 38M` (larger/slower) models.",
)

# Conditional generation tab: sequence length, an uploaded .a3m MSA, the number
# of MSA sequences to sample and the model name are passed to make_cond_seq.
csg_app = gr.Interface(
    fn=make_cond_seq,
    inputs=[
        gr.Slider(10, 250, label="Sequence Length"),
        gr.File(file_types=["a3m"], label="MSA File"),
        gr.Number(value=64, precision=0, label="Number of Sequences to Sample"),
        gr.Dropdown(
            ["EvoDiff-MSA"],
            value="EvoDiff-MSA",
            type="value",
            label="Model",
        ),
    ],
    outputs=["text"],
    # Example input kept for reference (currently disabled):
    # examples=[["https://github.com/microsoft/evodiff/raw/main/examples/example_files/bfd_uniclust_hits.a3m"]],
    title="Conditional sequence generation",
    description="Evolutionary guided sequence generation with the `EvoDiff-MSA` model.",
)

# Inpainting tab: a sequence plus start/end indices and the model name are
# passed to make_inpainted_idrs; the returned dict is rendered as text.
idr_app = gr.Interface(
    fn=make_inpainted_idrs,
    inputs=[
        gr.Textbox(
            value="DQTERTVRSFEGRRTAPYLDSRNVLTIGYGHLLNRPGANKSWEGRLTSALPREFKQRLTELAASQLHETDVRLATARAQALYGSGAYFESVPVSLNDLWFDSVFNLGERKLLNWSGLRTKLESRDWGAAAKDLGRHTFGREPVSRRMAESMRMRRGIDLNHYNI",
            label="Sequence",
        ),
        gr.Number(value=20, precision=0, label="Start Index"),
        gr.Number(value=50, precision=0, label="End Index"),
        gr.Dropdown(
            ["EvoDiff-Seq"],
            value="EvoDiff-Seq",
            type="value",
            label="Model",
        ),
    ],
    outputs=["text"],
    title="Inpainting IDRs",
    # Fixed user-facing typo: "Inpaining" -> "Inpainting".
    description="Inpainting a new region inside a given sequence using the `EvoDiff-Seq` model.",
)

# Motif scaffolding tab: PDB code, start/end index lists (entered as text such
# as "[15, 51]"), scaffold length and model name are passed to
# make_scaffold_motifs.
scaffold_app = gr.Interface(
    fn=make_scaffold_motifs,
    inputs=[
        gr.Textbox(value="1prw", label="PDB Code"),
        gr.Textbox(value="[15, 51]", label="Start Index (as list)"),
        gr.Textbox(value="[34, 70]", label="End Index (as list)"),
        gr.Number(value=75, precision=0, label="Scaffold Length"),
        # Only "EvoDiff-Seq" is offered: make_scaffold_motifs has no branch for
        # "EvoDiff-MSA", so listing it let users pick an option that errors.
        gr.Dropdown(
            ["EvoDiff-Seq"],
            value="EvoDiff-Seq",
            type="value",
            label="Model",
        ),
    ],
    outputs=["text"],
    title="Scaffolding functional motifs",
    description="Scaffolding a new functional motif inside a given PDB structure using the `EvoDiff-Seq` model.",
)

# Top-level page: a header row with title/credits, then one tab per
# sub-interface.
with gr.Blocks() as edapp:
    with gr.Row():
        # Credits: removed a stray "Profile" prefix that had been pasted in
        # front of "Ava P. Amini".
        gr.Markdown(
            """
            # EvoDiff
            ## Generation of protein sequences and evolutionary alignments via discrete diffusion models

            Created By: Microsoft Research [Sarah Alamdari, Nitya Thakkar, Rianne van den Berg, Alex X. Lu, Nicolo Fusi, Ava P. Amini, and Kevin K. Yang]

            Spaces App By: Tuple, The Cloud Genomics Company [Colby T. Ford]
            """
        )
    with gr.Row():
        # scaffold_app ("Scaffolding functional motifs") is currently not
        # mounted; add it and its tab title here to enable it.
        gr.TabbedInterface(
            [
                usg_app,
                csg_app,
                idr_app,
            ],
            [
                "Unconditional sequence generation",
                "Conditional generation",
                "Inpainting IDRs",
            ],
        )


# Start the Gradio server only when run as a script, not on import.
if __name__ == "__main__":
    edapp.launch()