File size: 2,788 Bytes
75c9fa7
9d0e799
1ae49ba
 
 
 
 
 
9d0e799
1ae49ba
 
 
 
 
 
 
 
75c9fa7
 
1ae49ba
9d0e799
1ae49ba
9d0e799
 
 
 
1ae49ba
9d0e799
1ae49ba
 
 
75c9fa7
1ae49ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75c9fa7
9d0e799
1ae49ba
 
 
 
 
 
 
 
 
 
 
 
 
4f07e20
1ae49ba
 
 
 
 
 
75c9fa7
1ae49ba
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import streamlit as st
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# --- Page chrome and sidebar controls -------------------------------------
# Order matters: Streamlit renders top-to-bottom and set_page_config must be
# the first st.* call in the script.
st.set_page_config(page_title="NeoProtein Designer", page_icon="🧬")
st.title("🧬 NeoProtein-GPT Protein Designer")
st.markdown("""
### Design novel protein sequences with unique binding sites
*Using the [NeoProtein-GPT](https://huggingface.co/ayyuce/NeoProtein-GPT) model*
""")

# Sidebar widgets. These module-level names are read directly by
# generate_sequences() below — renaming them would break generation.
with st.sidebar:
    st.header("Parameters")
    # Free-text motif; X acts as a wildcard position (see help text).
    binding_motif = st.text_input("Binding site motif (e.g., AXXC):", help="Use X for wildcard positions")
    # Number of NEW tokens to generate beyond the prompt (min, max, default).
    seq_length = st.slider("Sequence length", 50, 500, 150)
    # Sampling temperature: <1.0 is conservative, >1.0 is more diverse.
    temperature = st.slider("Temperature (creativity)", 0.1, 2.0, 1.0)
    num_sequences = st.slider("Number of sequences", 1, 5, 3)

@st.cache_resource(show_spinner=False)
def load_model():
    """Load the NeoProtein-GPT model and tokenizer from the Hugging Face hub.

    Cached with st.cache_resource so the weights are loaded once per server
    process and shared across sessions/reruns.

    Returns:
        tuple: (GPT2LMHeadModel, GPT2Tokenizer) ready for generation.
    """
    # FIX: dropped force_download=True — it re-downloaded the full model
    # weights on every cold start, defeating both the local HF cache and
    # st.cache_resource. Also dropped resume_download/local_files_only:
    # resume_download is deprecated in huggingface_hub and local_files_only
    # defaults to False anyway.
    model = GPT2LMHeadModel.from_pretrained(
        "ayyuce/NeoProtein-GPT",
        trust_remote_code=True,
    )
    tokenizer = GPT2Tokenizer.from_pretrained("ayyuce/NeoProtein-GPT")
    return model, tokenizer

# Eagerly load (or fetch from cache) at script run; downstream code assumes
# these module-level names exist.
model, tokenizer = load_model()

def generate_sequences():
    """Generate protein sequences conditioned on the sidebar parameters.

    Reads the module-level widget values (binding_motif, seq_length,
    temperature, num_sequences) and the cached model/tokenizer.

    Returns:
        list[str]: decoded sequences with the prompt tokens stripped.
            Empty list when the motif is missing or generation fails
            (the error is surfaced in the UI via st.error).
    """
    if not binding_motif:
        st.error("Please enter a binding motif")
        # FIX: return [] instead of bare `return` (None) so every failure
        # path yields the same type and callers can check truthiness safely.
        return []

    # NOTE(review): "<start>" appears twice in the template; presumably this
    # matches the model's training format — confirm against the model card.
    prompt = f"<start>BindingMotif:{binding_motif}<start>Seq:"

    try:
        inputs = tokenizer(prompt, return_tensors="pt")
        # Prompt length in tokens; used both to cap generation length and to
        # slice the prompt off the decoded output.
        input_length = inputs.input_ids.shape[1]

        outputs = model.generate(
            inputs.input_ids,
            # Pass the mask explicitly — avoids the HF warning and any
            # ambiguity when pad and eos token ids coincide.
            attention_mask=inputs.attention_mask,
            max_length=input_length + seq_length,
            temperature=temperature,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            num_return_sequences=num_sequences,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Drop the prompt tokens so only the generated residues remain.
        return [
            tokenizer.decode(output[input_length:], skip_special_tokens=True)
            for output in outputs
        ]

    except Exception as e:
        # Surface the failure in the UI rather than crashing the app.
        st.error(f"Generation failed: {str(e)}")
        return []


# Trigger generation on button press; the spinner wraps the (slow) model
# call so the UI stays responsive-looking during sampling.
if st.button("Generate Protein Sequences"):
    with st.spinner("Designing novel proteins..."):
        sequences = generate_sequences()
        
    # Falsy result (None or []) means the motif was missing or generation
    # failed — generate_sequences() has already shown the error.
    if sequences:
        st.subheader("Generated Sequences")
        for i, seq in enumerate(sequences):
            # Render each sequence in a fenced fasta code block.
            st.markdown(f"""
            **Sequence #{i+1}**
            ```fasta
            {seq}
            ```
            """)

# Static usage instructions rendered below the results area.
st.markdown("""
### How to use:
1. Enter your target binding motif using single-letter amino acid codes
2. Adjust parameters in the sidebar
3. Click the generate button
4. Results will appear in FASTA format

**Example motifs:**
- `GHXXXH` for histidine-rich motifs
- `CXXC` for disulfide bond motifs
- `DE` for acidic patches
""")