Spaces:

thaidaev
/

zsp

Running

App Files Files

Massimo G. Totaro committed on Nov 14, 2024

Commit

ddc1bd3

1 Parent(s): fba8f5e

QOL and gradio upgrade

Browse files

Files changed (7) hide show

.gitignore +2 -1
README.md +1 -1
app.py +14 -26
data.py +22 -31
instructions.md +58 -36
model.py +11 -7
requirements.txt +1 -0

.gitignore CHANGED Viewed

@@ -1,3 +1,4 @@
 Dockerfile
 *.ipynb
-*/

 Dockerfile
 *.ipynb
+out.*
+*/

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 📈
 colorFrom: gray
 colorTo: red
 sdk: gradio
-sdk_version: 4.8.0
 app_file: app.py
 pinned: false
 license: bsd-2-clause

 colorFrom: gray
 colorTo: red
 sdk: gradio
+sdk_version: 5.5.0
 app_file: app.py
 pinned: false
 license: bsd-2-clause

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
-from tempfile import NamedTemporaryFile
-from gradio import Blocks, Button, Checkbox, Dropdown, Examples, File, HTML, Markdown, Textbox
 from model import get_models
 from data import Data
@@ -17,19 +16,14 @@ def app(*argv):
     # Unpack the arguments
     seq, trg, model_name, *_ = argv
     scoring = SCORING[scoring_strategy.value]
-    try:
-        # Calculate the data based on the input parameters
-        data = Data(seq, trg, model_name, scoring, out_file).calculate()
-    except Exception as e:
-        # If an error occurs, return an HTML error message
-        return f'<!DOCTYPE html><html><body><h1 style="background-color:#F70D1A;text-align:center;">Error: {str(e)}</h1></body></html>', None
     # If no error occurs, return the calculated data
-    return repr(data), File(value=out_file.name, visible=True)
 # Create the Gradio interface
-with open("instructions.md", "r", encoding="utf-8") as md,\
-     NamedTemporaryFile(mode='w+')                  as out_file,\
-     Blocks()                                       as esm_scan:
     # Define the interface components
     Markdown(md.read())
@@ -46,20 +40,14 @@ with open("instructions.md", "r", encoding="utf-8") as md,\
         value=""
     )
     model_name = Dropdown(MODELS, label="Model", value="facebook/esm2_t30_150M_UR50D")
-    scoring_strategy = Checkbox(value=True, label="Use masked-marginals scoring")
-    btn = Button(value="Run")
-    out = HTML()
-    bto = File(
-        value=out_file.name,
-        visible=False,
-        label="Download",
-        file_count='single',
-        interactive=False
-    )
     btn.click(
         fn=app,
         inputs=[seq, trg, model_name],
-        outputs=[out, bto]
     )
     ex = Examples(
         examples=[
@@ -87,9 +75,9 @@ with open("instructions.md", "r", encoding="utf-8") as md,\
         inputs=[seq,
                 trg,
                 model_name],
-        outputs=[out,
-                 bto],
-        fn=app
     )
 # Launch the Gradio interface

+from gradio import Blocks, Button, Checkbox, DownloadButton, Dropdown, Examples, File, Image, Markdown, Textbox
 from model import get_models
 from data import Data
     # Unpack the arguments
     seq, trg, model_name, *_ = argv
     scoring = SCORING[scoring_strategy.value]
+    # Calculate the data based on the input parameters
+    data = Data(seq, trg, model_name, scoring).calculate()
     # If no error occurs, return the calculated data
+    return Image(value=data.image(), type='filepath', visible=True), DownloadButton(value=data.csv(), visible=True)
 # Create the Gradio interface
+with open("instructions.md", "r", encoding="utf-8") as md, Blocks() as esm_scan:
     # Define the interface components
     Markdown(md.read())
         value=""
     )
     model_name = Dropdown(MODELS, label="Model", value="facebook/esm2_t30_150M_UR50D")
+    scoring_strategy = Checkbox(value=True, label="Use higher accuracy scoring", interactive=True)
+    dlb = DownloadButton(label="Download raw data", visible=False)
+    out = Image(visible=False)
+    btn = Button(value="Run", variant="primary")
     btn.click(
         fn=app,
         inputs=[seq, trg, model_name],
+        outputs=[out, dlb]
     )
     ex = Examples(
         examples=[
         inputs=[seq,
                 trg,
                 model_name],
+        outputs=[out],
+        fn=app,
+        cache_examples=False
     )
 # Launch the Gradio interface

data.py CHANGED Viewed

@@ -1,12 +1,8 @@
 from math import ceil
-from re import match
-import seaborn as sns
-from model import Model
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 from model import Model
@@ -26,19 +22,18 @@ class Data:
         """Parse input substitutions"""
         self.mode = None
         self.sub = list()
-        self.trg = trg.strip().upper()
         self.resi = list()
         # Identify running mode
-        if len(self.trg.split()) == 1 and len(self.trg.split()[0]) == len(self.seq) and all(match(r'\w+', x) for x in self.trg):
             # If single string of same length as sequence, seq vs seq mode
             self.mode = 'MUT'
-            for resi, (src, trg) in enumerate(zip(self.seq, self.trg), 1):
                 if src != trg:
                     self.sub.append(f"{src}{resi}{trg}")
                     self.resi.append(resi)
         else:
-            self.trg = self.trg.split()
             if all(match(r'\d+', x) for x in self.trg):
                 # If all strings are numbers, deep mutational scanning mode
                 self.mode = 'DMS'
@@ -64,7 +59,7 @@ class Data:
         self.sub = pd.DataFrame(self.sub, columns=['0'])
-    def __init__(self, src:str, trg:str, model_name:str='facebook/esm2_t33_650M_UR50D', scoring_strategy:str='masked-marginals', out_file=None):
         "initialise data"
             # if model has changed, load new model
         if self.model.model_name != model_name:
@@ -76,13 +71,14 @@ class Data:
         self.scoring_strategy = scoring_strategy
         self.token_probs = None
         self.out = pd.DataFrame(self.sub, columns=['0', self.model_name])
-        self.out_str = None
-        self.out_buffer = out_file.name if 'name' in dir(out_file) else out_file
     def parse_output(self) -> None:
         "format output data for visualisation"
         if self.mode == 'TMS':
             self.process_tms_mode()
         else:
             if self.mode == 'DMS':
                 self.sort_by_residue_and_score()
@@ -90,14 +86,12 @@ class Data:
                 self.sort_by_score()
             else:
                 raise RuntimeError(f"Unrecognised mode {self.mode}")
-            if self.out_buffer:
-                self.out.round(2).to_csv(self.out_buffer, index=False, header=False)
-            self.out_str = (self.out.style
-                            .format(lambda x: f'{x:.2f}' if isinstance(x, float) else x)
-                            .hide(axis=0)
-                            .hide(axis=1)
-                            .background_gradient(cmap="RdYlGn", vmax=8, vmin=-8)
-                            .to_html(justify='center'))
     def sort_by_score(self):
         self.out = self.out.sort_values(self.model_name, ascending=False)
@@ -155,10 +149,7 @@ class Data:
         else:
             self.plot_multiple_heatmaps(ncols, nrows)
-        if self.out_buffer:
-            plt.savefig(self.out_buffer, format='svg')
-            with open(self.out_buffer, 'r', encoding='utf-8') as f:
-                self.out_str = f.read()
     def plot_single_heatmap(self):
         fig = plt.figure(figsize=(12, 6))
@@ -200,10 +191,10 @@ class Data:
         self.parse_output()
         return self
-    def __str__(self):
-        "return output data in DataFrame format"
-        return str(self.out)
-    def __repr__(self):
-        "return output data in html format"
-        return self.out_str

+import dataframe_image as dfi
 from math import ceil
 import matplotlib.pyplot as plt
 import pandas as pd
+from re import match
 import seaborn as sns
 from model import Model
         """Parse input substitutions"""
         self.mode = None
         self.sub = list()
+        self.trg = trg.strip().upper().split()
         self.resi = list()
         # Identify running mode
+        if len(self.trg) == 1 and len(self.trg[0]) == len(self.seq) and match(r'^\w+$', self.trg[0]):
             # If single string of same length as sequence, seq vs seq mode
             self.mode = 'MUT'
+            for resi, (src, trg) in enumerate(zip(self.seq, self.trg[0]), 1):
                 if src != trg:
                     self.sub.append(f"{src}{resi}{trg}")
                     self.resi.append(resi)
         else:
             if all(match(r'\d+', x) for x in self.trg):
                 # If all strings are numbers, deep mutational scanning mode
                 self.mode = 'DMS'
         self.sub = pd.DataFrame(self.sub, columns=['0'])
+    def __init__(self, src:str, trg:str, model_name:str='facebook/esm2_t33_650M_UR50D', scoring_strategy:str='masked-marginals', out_file='out'):
         "initialise data"
             # if model has changed, load new model
         if self.model.model_name != model_name:
         self.scoring_strategy = scoring_strategy
         self.token_probs = None
         self.out = pd.DataFrame(self.sub, columns=['0', self.model_name])
+        self.out_img = f'{out_file}.png'
+        self.out_csv = f'{out_file}.csv'
     def parse_output(self) -> None:
         "format output data for visualisation"
         if self.mode == 'TMS':
             self.process_tms_mode()
+            self.out.to_csv(self.out_csv, float_format='%.2f')
         else:
             if self.mode == 'DMS':
                 self.sort_by_residue_and_score()
                 self.sort_by_score()
             else:
                 raise RuntimeError(f"Unrecognised mode {self.mode}")
+            out_df = (self.out.style
+                        .format(lambda x: f'{x:.2f}' if isinstance(x, float) else x)
+                        .hide(axis=0).hide(axis=1)
+                        .background_gradient(cmap="RdYlGn", vmax=8, vmin=-8))
+            dfi.export(out_df, self.out_img, max_rows=-1, max_cols=-1, dpi=300)
+            self.out.to_csv(self.out_csv, float_format='%.2f', index=False, header=False)
     def sort_by_score(self):
         self.out = self.out.sort_values(self.model_name, ascending=False)
         else:
             self.plot_multiple_heatmaps(ncols, nrows)
+        plt.savefig(self.out_img, format='png', dpi=300)
     def plot_single_heatmap(self):
         fig = plt.figure(figsize=(12, 6))
         self.parse_output()
         return self
+    def csv(self):
+        "return output data"
+        return self.out_csv
+    def image(self):
+        "return output data"
+        return self.out_img

instructions.md CHANGED Viewed

@@ -1,39 +1,61 @@
 # **ESM-Scan**
 Calculate the <u>fitness of single amino acid substitutions</u> on proteins, using a [zero-shot](https://doi.org/10.1101/2021.07.09.450648) [language model predictor](https://github.com/facebookresearch/esm)
-  <details>
-    <summary> <b> USAGE INSTRUCTIONS </b> </summary>
-### **Setup**
-No setup is required, just fill the input boxes with the required data and click on the `Run` button.
-A list of examples can be found at the bottom of the page, click on them to autofill the fields.
-If the server is not used for some time, it will go into standby.
-Running a calculation resumes the tool from standby, the first run might take longer due to startup and model loading.
-### **Input**
-- write the protein full amino acid sequence to be analysed in the **Sequence** text box
-  jolly charachters (e.g. `-X.B`) can be inserted but, at the moment, visualisation cannot handle them
-- write the substitutions to test in the **Substitutions** box
-  there are three running modes that can be used, depending on the input:
-  + *single substitution* or list thereof (in the form of `R218K R218W`): the single substitution is scored
-  + *residue position* or list thereof: all possible substitutions will be evaluated
-  + *same-length sequence*: the differing amino acid substitutions will be evaluated, one by one
-  + any other *different input*: a deep mutational scan of the full sequence will be performed
-- the ESM model to use for the calculations can be chosen among those that are available on Hugging Face Model Hub;
-  `esm2_t33_650M_UR50D` offers the best expense-accuracy tradeoff[*](https://doi.org/10.1126/science.ade2574)
-- the `masked-marginals` scoring strategy considers sequence context at inference time, being slower but more accurate;
-  in case of long runtimes, you can tick the box off to speed the calculations up significantly, sacrificing accuracy
-- when running a deep mutational scan, it is recommended to use smaller models (8M, 35M, 150M parameters), since the runtime is significant, especially for longer sequences and the server might be overloaded;
-  over 30 min might be necessary for calculating a 300-residue-long sequence with larger models
-  in general, accuracy is influenced significantly by the scoring strategy and less so by the model size, so it is suggested to reduce the latter first when optimising for runtime;
-  the scoring strategy computational cost scales with the number of substitutions tested, while the model’s with the wild-type sequence length
-- it is possible to calculate the effect of multiple concurrent substitutions, but this has to be done manually, by changing the input sequence and running the calculation again
-### **Output**
-Your results will be shown in a color-coded table, except for the deep mutational scan which will yield a heatmap.
-The output data can be downloaded from the box at the bottom.
-File extensions are not supported by the server and need to be appended to the filenames after downloading:
-- `CSV` for tables
-- `SVG` for full-sequence deep mutational scan
-  </details>

 # **ESM-Scan**
 Calculate the <u>fitness of single amino acid substitutions</u> on proteins, using a [zero-shot](https://doi.org/10.1101/2021.07.09.450648) [language model predictor](https://github.com/facebookresearch/esm)
+<details>
+  <summary> <b> USAGE INSTRUCTIONS </b> </summary>
+## Setup
+No setup is required. Simply fill in the input boxes with the necessary data and click the **Run** button.
+You can find a list of examples at the bottom of the page; clicking on them will autofill the fields for you.
+If the server remains idle for a period, it will enter standby mode. Running a calculation will wake the tool from standby, but note that the first run may take longer due to startup and model loading.
+## Input
+**Sequence**: Enter the full amino acid sequence to be analyzed in the **Sequence** text box.
+  Note: While wildcard characters (e.g., `-X.B`) can be included, they currently cannot be visualised.
+**Substitutions**: Specify the substitutions you wish to test in the **Substitutions** box. The tool supports four running modes based on your input:
+- **Single Substitution**: Input one or more substitutions (e.g. `R218K R218W`) to score specific changes.
+- **Residue Position**: Provide residue positions to evaluate all possible substitutions at those sites.
+- **Same-Length Sequence**: Provide a sequence of the same length as the input sequence; every position where the two sequences differ is evaluated, one by one, as a single substitution.
+- **Different Inputs**: For any other input format, a deep mutational scan of the full sequence will be performed.
+**Model Selection**: Choose an ESM model for calculations from those available on Hugging Face Model Hub.
+  The model `esm2_t33_650M_UR50D` offers an optimal balance between cost and accuracy [*](https://doi.org/10.1126/science.ade2574).
+**Accuracy Option**: The **Use higher accuracy scoring** option applies a masked-marginals scoring strategy, which considers sequence context during inference.
+  While this method is slower, it enhances accuracy. If you experience long runtimes, unchecking this option can significantly speed up calculations at the cost of some accuracy.
+**Deep Mutational Scan Recommendations**: When performing a deep mutational scan, it is advisable to use smaller models (8M, 35M, or 150M parameters) due to significant runtime concerns—especially with longer sequences or during peak server usage times.
+  For example, calculating a 300-residue-long sequence with larger models may require over 30 minutes.
+  Generally, accuracy is more affected by the scoring strategy than by model size; therefore, prioritise reducing model size when optimizing for runtime.
+  The computational cost of the scoring strategy scales with the number of substitutions tested, while model cost scales with wild-type sequence length.
+**Concurrent Substitutions**: To calculate the effect of multiple concurrent substitutions, you must manually change the input sequence and rerun the calculation. Accuracy is not guaranteed, because this use case has not yet been tested.
+## Output
+Results are displayed in a color-coded table, except for deep mutational scans, which produce a heatmap.
+In the table:
+- Beneficial substitutions are highlighted in green with positive values.
+- Detrimental substitutions appear in red with negative values.
+As a rule of thumb, score differences of *4* or more are considered significant. For instance:
+- A substitution scoring *-6* is likely detrimental to protein functionality.
+- A score of *+2* is generally regarded as neutral.
+You can download the raw output data using the **Download raw data** button at the bottom of the page.
+<b>
+If you use this tool in your research, please cite:
+- Totaro, M.G. (2023). “ESM-Scan - a tool to guide amino acid substitutions.” bioRxiv. [doi.org/10.1101/2023.12.12.571273](https://doi.org/10.1101/2023.12.12.571273)
+- Meier, J. (2021). “Language Models Enable Zero-Shot Prediction of the Effects of Mutations on Protein Function.” bioRxiv (Cold Spring Harbor Laboratory), July. [doi.org/10.1101/2021.07.09.450648](https://doi.org/10.1101/2021.07.09.450648)
+</b>
+</details>

model.py CHANGED Viewed

@@ -1,5 +1,6 @@
-from huggingface_hub import HfApi, ModelFilter
 import torch
 from transformers import AutoTokenizer, AutoModelForMaskedLM
 from transformers.tokenization_utils_base import BatchEncoding
 from transformers.modeling_outputs import MaskedLMOutput
@@ -10,9 +11,9 @@ def get_models() -> list[None|str]:
     if not any(
         out := [
             m.modelId for m in HfApi().list_models(
-                filter=ModelFilter(
-                    author="facebook", model_name="esm", task="fill-mask"
-                ),
                 sort="lastModified",
                 direction=-1
             )
@@ -34,6 +35,9 @@ class Model:
             # Check if CUDA is available and if so, use it
             if torch.cuda.is_available():
                 self.model = self.model.cuda()
     def tokenise(self, input: str) -> BatchEncoding:
         """Convert input string to batch of tokens."""
@@ -41,7 +45,7 @@ class Model:
     def __call__(self, batch_tokens: torch.Tensor, **kwargs) -> MaskedLMOutput:
         """Run model on batch of tokens."""
-        return self.model(batch_tokens, **kwargs)
     def __getitem__(self, key: str) -> int:
         """Get token ID from character."""
@@ -70,7 +74,7 @@ class Model:
         if data.scoring_strategy.startswith("masked-marginals"):
             all_token_probs = []
             # For each token in the batch
-            for i in range(batch_tokens.size()[1]):
                 # If the token is in the list of residues
                 if i in data.resi:
                     # Clone the batch tokens and mask the current token
@@ -96,4 +100,4 @@ class Model:
                 token_probs,
             ),
             axis=1,
-        )

+from huggingface_hub import HfApi
 import torch
+from tqdm import tqdm
 from transformers import AutoTokenizer, AutoModelForMaskedLM
 from transformers.tokenization_utils_base import BatchEncoding
 from transformers.modeling_outputs import MaskedLMOutput
     if not any(
         out := [
             m.modelId for m in HfApi().list_models(
+                author="facebook",
+                model_name="esm",
+                task="fill-mask",
                 sort="lastModified",
                 direction=-1
             )
             # Check if CUDA is available and if so, use it
             if torch.cuda.is_available():
                 self.model = self.model.cuda()
+                self.device = torch.device("cuda")
+            else:
+                self.device = torch.device("cpu")
     def tokenise(self, input: str) -> BatchEncoding:
         """Convert input string to batch of tokens."""
     def __call__(self, batch_tokens: torch.Tensor, **kwargs) -> MaskedLMOutput:
         """Run model on batch of tokens."""
+        return self.model(batch_tokens.to(self.device), **kwargs)
     def __getitem__(self, key: str) -> int:
         """Get token ID from character."""
         if data.scoring_strategy.startswith("masked-marginals"):
             all_token_probs = []
             # For each token in the batch
+            for i in tqdm(range(batch_tokens.size()[1])):
                 # If the token is in the list of residues
                 if i in data.resi:
                     # Clone the batch tokens and mask the current token
                 token_probs,
             ),
             axis=1,
+        )

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 gradio
 pandas
 seaborn

+dataframe-image
 gradio
 pandas
 seaborn