TianlaiChen committed on
Commit 4ed0ba3
1 Parent(s): 9b1cba9

Update app.py
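
The update replaces the single-pass scoring in compute_pseudo_perplexity (all binder positions masked at once) with true pseudo-perplexity: each binder residue is masked and scored individually, and the per-token losses are averaged before exponentiation. Assuming one token per residue, as with ESM-style tokenizers, the new loop computes

$$\mathrm{PPL}(b \mid p) = \exp\Big(-\frac{1}{L}\sum_{i=1}^{L}\log P\big(b_i \mid p,\, b_{\setminus i}\big)\Big)$$

where p is the target protein, b is the binder of length L, and b_{\setminus i} is the binder with residue i replaced by the mask token.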

Files changed (1)
  1. app.py +104 -92
app.py CHANGED
@@ -1,93 +1,105 @@
- import gradio as gr
- from transformers import AutoTokenizer, AutoModelForMaskedLM
- import torch
- from torch.distributions.categorical import Categorical
- import numpy as np
- import pandas as pd
-
- # Load the model and tokenizer
- tokenizer = AutoTokenizer.from_pretrained("TianlaiChen/PepMLM-650M")
- model = AutoModelForMaskedLM.from_pretrained("TianlaiChen/PepMLM-650M")
-
- def compute_pseudo_perplexity(model, tokenizer, protein_seq, binder_seq):
-     sequence = protein_seq + binder_seq
-     tensor_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
-
-     # Create a mask for the binder sequence
-     binder_mask = torch.zeros(tensor_input.shape).to(model.device)
-     binder_mask[0, -len(binder_seq)-1:-1] = 1
-
-     # Mask the binder sequence in the input and create labels
-     masked_input = tensor_input.clone().masked_fill_(binder_mask.bool(), tokenizer.mask_token_id)
-     labels = tensor_input.clone().masked_fill_(~binder_mask.bool(), -100)
-
-     with torch.no_grad():
-         loss = model(masked_input, labels=labels).loss
-     return np.exp(loss.item())
-
-
- def generate_peptide(protein_seq, peptide_length, top_k, num_binders):
-     peptide_length = int(peptide_length)
-     top_k = int(top_k)
-     num_binders = int(num_binders)
-
-     binders_with_ppl = []
-
-     for _ in range(num_binders):
-         # Generate binder
-         masked_peptide = '<mask>' * peptide_length
-         input_sequence = protein_seq + masked_peptide
-         inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)
-
-         with torch.no_grad():
-             logits = model(**inputs).logits
-         mask_token_indices = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
-         logits_at_masks = logits[0, mask_token_indices]
-
-         # Apply top-k sampling
-         top_k_logits, top_k_indices = logits_at_masks.topk(top_k, dim=-1)
-         probabilities = torch.nn.functional.softmax(top_k_logits, dim=-1)
-         predicted_indices = Categorical(probabilities).sample()
-         predicted_token_ids = top_k_indices.gather(-1, predicted_indices.unsqueeze(-1)).squeeze(-1)
-
-         generated_binder = tokenizer.decode(predicted_token_ids, skip_special_tokens=True).replace(' ', '')
-
-         # Compute PPL for the generated binder
-         ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
-
-         # Add the generated binder and its PPL to the results list
-         binders_with_ppl.append([generated_binder, ppl_value])
-
-     # Convert the list of lists to a pandas dataframe
-     df = pd.DataFrame(binders_with_ppl, columns=["Binder", "Perplexity"])
-
-     # Save the dataframe to a CSV file
-     output_filename = "output.csv"
-     df.to_csv(output_filename, index=False)
-
-     return binders_with_ppl, output_filename
-
-
- # Define the Gradio interface
- interface = gr.Interface(
-     fn=generate_peptide,
-     inputs=[
-         gr.Textbox(label="Protein Sequence", info="Enter protein sequence here", type="text"),
-         gr.Slider(3, 50, value=15, label="Peptide Length", step=1, info='Default value is 15'),
-         gr.Slider(1, 10, value=3, label="Top K Value", step=1, info='Default value is 3'),
-         gr.Dropdown(choices=[1, 2, 4, 8, 16, 32], label="Number of Binders", value=1)
-     ],
-     outputs=[
-         gr.Dataframe(
-             headers=["Binder", "Perplexity"],
-             datatype=["str", "number"],
-             col_count=(2, "fixed")
-         ),
-         gr.File(label="Download CSV")  # gr.outputs.File is deprecated; gr.File is the current component
-     ],
-     title="PepMLM: Target Sequence-Conditioned Generation of Peptide Binders via Masked Language Modeling"
- )
-
 
+ import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+ import torch
+ from torch.distributions.categorical import Categorical
+ import numpy as np
+ import pandas as pd
+
+ # Load the model and tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("TianlaiChen/PepMLM-650M")
+ model = AutoModelForMaskedLM.from_pretrained("TianlaiChen/PepMLM-650M")
+
+ def compute_pseudo_perplexity(model, tokenizer, protein_seq, binder_seq):
+     sequence = protein_seq + binder_seq
+     tensor_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
+     total_loss = 0
+
+     # Loop through each token in the binder sequence
+     # (skipping the final special token at index -1)
+     for i in range(-len(binder_seq)-1, -1):
+         # Create a copy of the original tensor
+         masked_input = tensor_input.clone()
+
+         # Mask one token at a time
+         masked_input[0, i] = tokenizer.mask_token_id
+         # Create labels: -100 everywhere except the masked position
+         labels = torch.full(tensor_input.shape, -100).to(model.device)
+         labels[0, i] = tensor_input[0, i]
+
+         # Get model prediction and loss
+         with torch.no_grad():
+             outputs = model(masked_input, labels=labels)
+             total_loss += outputs.loss.item()
+
+     # Calculate the average loss
+     avg_loss = total_loss / len(binder_seq)
+
+     # Calculate pseudo perplexity
+     pseudo_perplexity = np.exp(avg_loss)
+     return pseudo_perplexity
+
+
+ def generate_peptide(protein_seq, peptide_length, top_k, num_binders):
+     peptide_length = int(peptide_length)
+     top_k = int(top_k)
+     num_binders = int(num_binders)
+
+     binders_with_ppl = []
+
+     for _ in range(num_binders):
+         # Generate binder
+         masked_peptide = '<mask>' * peptide_length
+         input_sequence = protein_seq + masked_peptide
+         inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)
+
+         with torch.no_grad():
+             logits = model(**inputs).logits
+         mask_token_indices = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
+         logits_at_masks = logits[0, mask_token_indices]
+
+         # Apply top-k sampling
+         top_k_logits, top_k_indices = logits_at_masks.topk(top_k, dim=-1)
+         probabilities = torch.nn.functional.softmax(top_k_logits, dim=-1)
+         predicted_indices = Categorical(probabilities).sample()
+         predicted_token_ids = top_k_indices.gather(-1, predicted_indices.unsqueeze(-1)).squeeze(-1)
+
+         generated_binder = tokenizer.decode(predicted_token_ids, skip_special_tokens=True).replace(' ', '')
+
+         # Compute pseudo-perplexity for the generated binder
+         ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
+
+         # Add the generated binder and its pseudo-perplexity to the results list
+         binders_with_ppl.append([generated_binder, ppl_value])
+
+     # Convert the list of lists to a pandas dataframe
+     df = pd.DataFrame(binders_with_ppl, columns=["Binder", "Perplexity"])
+
+     # Save the dataframe to a CSV file
+     output_filename = "output.csv"
+     df.to_csv(output_filename, index=False)
+
+     return binders_with_ppl, output_filename
+
+
+ # Define the Gradio interface
+ interface = gr.Interface(
+     fn=generate_peptide,
+     inputs=[
+         gr.Textbox(label="Protein Sequence", info="Enter protein sequence here", type="text"),
+         gr.Slider(3, 50, value=15, label="Peptide Length", step=1, info='Default value is 15'),
+         gr.Slider(1, 10, value=3, label="Top K Value", step=1, info='Default value is 3'),
+         gr.Dropdown(choices=[1, 2, 4, 8, 16, 32], label="Number of Binders", value=1)
+     ],
+     outputs=[
+         gr.Dataframe(
+             headers=["Binder", "Perplexity"],
+             datatype=["str", "number"],
+             col_count=(2, "fixed")
+         ),
+         gr.File(label="Download CSV")  # gr.outputs.File is deprecated; gr.File is the current component
+     ],
+     title="PepMLM: Target Sequence-Conditioned Generation of Peptide Binders via Masked Language Modeling"
+ )
+
  interface.launch()
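
For reference, a minimal usage sketch (not part of the commit) showing the functions above driven directly, without the Gradio UI; the target sequence is an arbitrary placeholder:

    # Hypothetical direct call; the target sequence is a placeholder, not from the app.
    target = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"
    binders, csv_path = generate_peptide(target, peptide_length=15, top_k=3, num_binders=4)
    for binder, ppl in binders:
        print(f"{binder}  pseudo-perplexity: {ppl:.2f}")
    print(f"Results saved to {csv_path}")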