Spaces:

AmelieSchreiber
/

variant_effects_llr

Runtime error

App Files Files Community

AmelieSchreiber committed on Dec 8, 2023

Commit

f3ea76e

1 Parent(s): c88b154

Create app.py

Browse files

Files changed (1) hide show

app.py +90 -0

app.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import gradio as gr
+from transformers import AutoTokenizer, EsmForMaskedLM
+import torch
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+def generate_heatmap(protein_sequence, start_pos=1, end_pos=None):
+    # Load the model and tokenizer
+    model_name = "facebook/esm2_t6_8M_UR50D"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = EsmForMaskedLM.from_pretrained(model_name)
+    # Tokenize the input sequence
+    input_ids = tokenizer.encode(protein_sequence, return_tensors="pt")
+    sequence_length = input_ids.shape[1] - 2  # Excluding the special tokens
+    # Adjust end position if not specified
+    if end_pos is None:
+        end_pos = sequence_length
+    # List of amino acids
+    amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
+    # Initialize heatmap
+    heatmap = np.zeros((20, end_pos - start_pos + 1))
+    # Calculate LLRs for each position and amino acid
+    for position in range(start_pos, end_pos + 1):
+        # Mask the target position
+        masked_input_ids = input_ids.clone()
+        masked_input_ids[0, position] = tokenizer.mask_token_id
+        # Get logits for the masked token
+        with torch.no_grad():
+            logits = model(masked_input_ids).logits
+        # Calculate log probabilities
+        probabilities = torch.nn.functional.softmax(logits[0, position], dim=0)
+        log_probabilities = torch.log(probabilities)
+        # Get the log probability of the wild-type residue
+        wt_residue = input_ids[0, position].item()
+        log_prob_wt = log_probabilities[wt_residue].item()
+        # Calculate LLR for each variant
+        for i, amino_acid in enumerate(amino_acids):
+            log_prob_mt = log_probabilities[tokenizer.convert_tokens_to_ids(amino_acid)].item()
+            heatmap[i, position - start_pos] = log_prob_mt - log_prob_wt
+    # Visualize the heatmap
+    plt.figure(figsize=(15, 5))
+    plt.imshow(heatmap, cmap="viridis_r", aspect="auto")
+    plt.xticks(range(end_pos - start_pos + 1), list(protein_sequence[start_pos-1:end_pos]))
+    plt.yticks(range(20), amino_acids)
+    plt.xlabel("Position in Protein Sequence")
+    plt.ylabel("Amino Acid Mutations")
+    plt.title("Predicted Effects of Mutations on Protein Sequence (LLR)")
+    plt.colorbar(label="Log Likelihood Ratio (LLR)")
+    plt.show()
+    # Save the plot to a temporary file and return the file path
+    temp_file = "temp_heatmap.png"
+    plt.savefig(temp_file)
+    plt.close()
+    return temp_file
+def heatmap_interface(sequence, start, end):
+    # Ensure start and end positions are within bounds
+    if start < 1 or end > len(sequence):
+        return "Start or end position is out of bounds."
+    # Generate heatmap
+    heatmap_path = generate_heatmap(sequence, start, end)
+    return heatmap_path
+# Define the Gradio interface
+iface = gr.Interface(
+    fn=heatmap_interface,
+    inputs=[
+        gr.Textbox(lines=2, placeholder="Enter Protein Sequence Here..."),
+        gr.Number(label="Start Position", default=1),
+        gr.Number(label="End Position")
+    ],
+    outputs="image",
+    live=True
+)
+# Run the Gradio app
+iface.launch()