Add README for instanovo-phospho-v1.0.0
Browse files
README.md
CHANGED
|
@@ -1,10 +1,221 @@
|
|
| 1 |
---
|
|
|
|
|
|
|
| 2 |
tags:
|
| 3 |
-
-
|
| 4 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
---
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
license: cc-by-nc-sa-4.0
|
| 3 |
+
library_name: pytorch
|
| 4 |
tags:
|
| 5 |
+
- proteomics
|
| 6 |
+
- mass-spectrometry
|
| 7 |
+
- peptide-sequencing
|
| 8 |
+
- de-novo-sequencing
|
| 9 |
+
- phosphoproteomics
|
| 10 |
+
- post-translational-modifications
|
| 11 |
+
- transformer
|
| 12 |
+
- biology
|
| 13 |
+
- computational-biology
|
| 14 |
+
pipeline_tag: text-generation
|
| 15 |
+
datasets:
|
| 16 |
+
- InstaDeepAI/InstaNovo-P
|
| 17 |
---
|
| 18 |
|
| 19 |
+
# InstaNovo-P: De novo Peptide Sequencing Model for Phosphoproteomics
|
| 20 |
+
|
| 21 |
+
## Model Description
|
| 22 |
+
|
| 23 |
+
InstaNovo-P is a specialized transformer-based model for de novo peptide sequencing from phosphoproteomics mass spectrometry data. This model is specifically trained and optimized for identifying phosphorylated peptides and their modification sites. The model predicts peptide sequences directly from MS/MS spectra with enhanced capabilities for detecting and localizing phosphorylation sites, making it particularly valuable for phosphoproteomics studies and PTM discovery.
|
| 24 |
+
|
| 25 |
+
## Usage
|
| 26 |
+
|
| 27 |
+
```python
|
| 28 |
+
import torch
|
| 29 |
+
import numpy as np
|
| 30 |
+
import pandas as pd
|
| 31 |
+
from instanovo.transformer.model import InstaNovo
|
| 32 |
+
from instanovo.utils import SpectrumDataFrame
|
| 33 |
+
from instanovo.transformer.dataset import SpectrumDataset, collate_batch
|
| 34 |
+
from torch.utils.data import DataLoader
|
| 35 |
+
from instanovo.inference import ScoredSequence
|
| 36 |
+
from instanovo.inference import BeamSearchDecoder
|
| 37 |
+
from instanovo.utils.metrics import Metrics
|
| 38 |
+
from tqdm.auto import tqdm  # picks the notebook or console progress bar automatically
|
| 39 |
+
|
| 40 |
+
# Load the model from the Hugging Face Hub
|
| 41 |
+
model, config = InstaNovo.from_pretrained("InstaDeepAI/instanovo-phospho-v1.0.0")
|
| 42 |
+
|
| 43 |
+
# Move the model to the GPU if available
|
| 44 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 45 |
+
model = model.to(device).eval()
|
| 46 |
+
|
| 47 |
+
# Update the residue set with custom modifications
|
| 48 |
+
model.residue_set.update_remapping(
|
| 49 |
+
{
|
| 50 |
+
"M(ox)": "M[UNIMOD:35]",
|
| 51 |
+
"M(+15.99)": "M[UNIMOD:35]",
|
| 52 |
+
"S(p)": "S[UNIMOD:21]", # Phosphorylation
|
| 53 |
+
"T(p)": "T[UNIMOD:21]",
|
| 54 |
+
"Y(p)": "Y[UNIMOD:21]",
|
| 55 |
+
"S(+79.97)": "S[UNIMOD:21]",
|
| 56 |
+
"T(+79.97)": "T[UNIMOD:21]",
|
| 57 |
+
"Y(+79.97)": "Y[UNIMOD:21]",
|
| 58 |
+
"Q(+0.98)": "Q[UNIMOD:7]", # Deamidation
|
| 59 |
+
"N(+0.98)": "N[UNIMOD:7]",
|
| 60 |
+
"Q(+.98)": "Q[UNIMOD:7]",
|
| 61 |
+
"N(+.98)": "N[UNIMOD:7]",
|
| 62 |
+
"C(+57.02)": "C[UNIMOD:4]", # Carboxyamidomethylation
|
| 63 |
+
"(+42.01)": "[UNIMOD:1]", # Acetylation
|
| 64 |
+
"(+43.01)": "[UNIMOD:5]", # Carbamylation
|
| 65 |
+
"(-17.03)": "[UNIMOD:385]", # Ammonia loss
|
| 66 |
+
}
|
| 67 |
+
)
|
| 68 |
+
|
| 69 |
+
# Load the test data
|
| 70 |
+
sdf = SpectrumDataFrame.from_huggingface(
|
| 71 |
+
"InstaDeepAI/InstaNovo-P",
|
| 72 |
+
is_annotated=True,
|
| 73 |
+
shuffle=False,
|
| 74 |
+
split="test[:10%]", # Let's only use a subset of the test data for faster inference
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
# Create the dataset
|
| 78 |
+
ds = SpectrumDataset(
|
| 79 |
+
sdf,
|
| 80 |
+
model.residue_set,
|
| 81 |
+
config.get("n_peaks", 200),
|
| 82 |
+
return_str=True,
|
| 83 |
+
annotated=True,
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
# Create the data loader
|
| 87 |
+
dl = DataLoader(ds, batch_size=64, shuffle=False, num_workers=0, collate_fn=collate_batch)
|
| 88 |
+
|
| 89 |
+
# Create the decoder
|
| 90 |
+
decoder = BeamSearchDecoder(model=model)
|
| 91 |
+
|
| 92 |
+
# Initialize lists to store predictions and targets
|
| 93 |
+
preds = []
|
| 94 |
+
targs = []
|
| 95 |
+
probs = []
|
| 96 |
+
|
| 97 |
+
# Iterate over the data loader
|
| 98 |
+
for _, batch in tqdm(enumerate(dl), total=len(dl)):
|
| 99 |
+
spectra, precursors, _, peptides, _ = batch
|
| 100 |
+
spectra = spectra.to(device)
|
| 101 |
+
precursors = precursors.to(device)
|
| 102 |
+
|
| 103 |
+
# Perform inference
|
| 104 |
+
with torch.no_grad():
|
| 105 |
+
p = decoder.decode(
|
| 106 |
+
spectra=spectra,
|
| 107 |
+
precursors=precursors,
|
| 108 |
+
beam_size=config["n_beams"],
|
| 109 |
+
max_length=config["max_length"],
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
preds += [x.sequence if isinstance(x, ScoredSequence) else [] for x in p]
|
| 114 |
+
probs += [
|
| 115 |
+
x.sequence_log_probability if isinstance(x, ScoredSequence) else -float("inf") for x in p
|
| 116 |
+
]
|
| 117 |
+
targs += list(peptides)
|
| 118 |
+
|
| 119 |
+
# Initialize metrics
|
| 120 |
+
metrics = Metrics(model.residue_set, config["isotope_error_range"])
|
| 121 |
+
|
| 122 |
+
# Compute precision and recall
|
| 123 |
+
aa_precision, aa_recall, peptide_recall, peptide_precision = metrics.compute_precision_recall(
|
| 124 |
+
targs, preds
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
# Compute amino acid error rate and AUC
|
| 128 |
+
aa_error_rate = metrics.compute_aa_er(targs, preds)
|
| 129 |
+
auc = metrics.calc_auc(targs, preds, np.exp(pd.Series(probs)))
|
| 130 |
+
|
| 131 |
+
print(f"amino acid error rate: {aa_error_rate:.5f}")
|
| 132 |
+
print(f"amino acid precision: {aa_precision:.5f}")
|
| 133 |
+
print(f"amino acid recall: {aa_recall:.5f}")
|
| 134 |
+
print(f"peptide precision: {peptide_precision:.5f}")
|
| 135 |
+
print(f"peptide recall: {peptide_recall:.5f}")
|
| 136 |
+
print(f"area under the PR curve: {auc:.5f}")
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
For a more detailed walkthrough, see the [Getting Started notebook](https://github.com/instadeepai/InstaNovo/blob/main/notebooks/getting_started_with_instanovo.ipynb) in the repository.
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Citation
|
| 143 |
+
|
| 144 |
+
If you use InstaNovo-P in your research, please cite:
|
| 145 |
+
|
| 146 |
+
```bibtex
|
| 147 |
+
@article {Lauridsen2025.05.14.654049,
|
| 148 |
+
title = {InstaNovo-P: A de novo peptide sequencing model for phosphoproteomics},
|
| 149 |
+
author = {Lauridsen, Jesper and Ramasamy, Pathmanaban and Catzel, Rachel and Canbay, Vahap
|
| 150 |
+
and Mabona, Amandla and Eloff, Kevin and Fullwood, Paul and Ferguson, Jennifer and
|
| 151 |
+
Kirketerp-M{\o}ller, Annekatrine and Goldschmidt, Ida Sofie and Claeys, Tine and van
|
| 152 |
+
Puyenbroeck, Sam and Lopez Carranza, Nicolas and Schoof, Erwin M. and Martens, Lennart and
|
| 153 |
+
Van Goey, Jeroen and Francavilla, Chiara and Jenkins, Timothy Patrick and Kalogeropoulos,
|
| 154 |
+
Konstantinos},
|
| 155 |
+
elocation-id = {2025.05.14.654049},
|
| 156 |
+
year = {2025},
|
| 157 |
+
doi = {10.1101/2025.05.14.654049},
|
| 158 |
+
publisher = {Cold Spring Harbor Laboratory},
|
| 159 |
+
URL = {https://www.biorxiv.org/content/early/2025/05/18/2025.05.14.654049},
|
| 160 |
+
eprint = {https://www.biorxiv.org/content/early/2025/05/18/2025.05.14.654049.full.pdf},
|
| 161 |
+
journal = {bioRxiv}
|
| 162 |
+
}
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
For the general InstaNovo model, please cite:
|
| 166 |
+
|
| 167 |
+
```bibtex
|
| 168 |
+
@article{eloff_kalogeropoulos_2025_instanovo,
|
| 169 |
+
title = {InstaNovo enables diffusion-powered de novo peptide sequencing in large-scale
|
| 170 |
+
proteomics experiments},
|
| 171 |
+
author = {Eloff, Kevin and Kalogeropoulos, Konstantinos and Mabona, Amandla and Morell,
|
| 172 |
+
Oliver and Catzel, Rachel and Rivera-de-Torre, Esperanza and Berg Jespersen,
|
| 173 |
+
Jakob and Williams, Wesley and van Beljouw, Sam P. B. and Skwark, Marcin J.
|
| 174 |
+
and Laustsen, Andreas Hougaard and Brouns, Stan J. J. and Ljungars,
|
| 175 |
+
Anne and Schoof, Erwin M. and Van Goey, Jeroen and auf dem Keller, Ulrich and
|
| 176 |
+
Beguir, Karim and Lopez Carranza, Nicolas and Jenkins, Timothy P.},
|
| 177 |
+
year = {2025},
|
| 178 |
+
month = {Mar},
|
| 179 |
+
day = {31},
|
| 180 |
+
journal = {Nature Machine Intelligence},
|
| 181 |
+
doi = {10.1038/s42256-025-01019-5},
|
| 182 |
+
issn = {2522-5839},
|
| 183 |
+
url = {https://doi.org/10.1038/s42256-025-01019-5}
|
| 184 |
+
}
|
| 185 |
+
```
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
## Resources
|
| 190 |
+
|
| 191 |
+
- **Code Repository**: [https://github.com/instadeepai/InstaNovo](https://github.com/instadeepai/InstaNovo)
|
| 192 |
+
- **Documentation**: [https://instadeepai.github.io/InstaNovo/](https://instadeepai.github.io/InstaNovo/)
|
| 193 |
+
- **Publication**: [https://www.nature.com/articles/s42256-025-01019-5](https://www.nature.com/articles/s42256-025-01019-5)
|
| 194 |
+
- **Preprint**: [https://www.biorxiv.org/content/10.1101/2025.05.14.654049v1](https://www.biorxiv.org/content/10.1101/2025.05.14.654049v1)
|
| 195 |
+
|
| 196 |
+
## License
|
| 197 |
+
|
| 198 |
+
- **Code**: Licensed under Apache License 2.0
|
| 199 |
+
- **Model Checkpoints**: Licensed under Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)
|
| 200 |
+
|
| 201 |
+
## Installation
|
| 202 |
+
|
| 203 |
+
```bash
|
| 204 |
+
pip install instanovo
|
| 205 |
+
```
|
| 206 |
+
|
| 207 |
+
For GPU support, install with CUDA dependencies:
|
| 208 |
+
```bash
|
| 209 |
+
pip install instanovo[cu126]
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
## Requirements
|
| 213 |
+
|
| 214 |
+
- Python >= 3.10, < 3.13
|
| 215 |
+
- PyTorch >= 1.13.0
|
| 216 |
+
- CUDA (optional, for GPU acceleration)
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
## Support
|
| 220 |
+
|
| 221 |
+
For questions, issues, or contributions, please visit the [GitHub repository](https://github.com/instadeepai/InstaNovo) or check the [documentation](https://instadeepai.github.io/InstaNovo/).
|