Update README.md
README.md (CHANGED)
@@ -10,12 +10,22 @@ In this work, we introduce **FusOn-pLM**, a novel pLM that fine-tunes the state-

Previous version of the usage example:

```python
from transformers import AutoTokenizer, AutoModel
import torch

# Load the tokenizer and model
model_name = "ChatterjeeLab/FusOn-pLM"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Example fusion oncoprotein sequence: MLLT10:PICALM, associated with Acute Myeloid Leukemia (LAML)
# Amino acids 1-80 are derived from the head gene, MLLT10
sequence = "MVSSDRPVSLEDEVSHSMKEMIGGCCVCSDERGWAENPLVYCDGHGCSVAVHQACYGIVQVPTGPWFCRKCESQERAARVPPQMGSVPVMTQPTLIYSQPVMRPPNPFGPVSGAQIQFM"

# Tokenize the input sequence
inputs = tokenizer(sequence, return_tensors="pt")

# Get the embeddings
with torch.no_grad():
    outputs = model(**inputs)
    # The embeddings are in the last_hidden_state tensor
    embeddings = outputs.last_hidden_state

# Convert embeddings to numpy array (if needed)
embeddings = embeddings.numpy()

print("Per-residue embeddings shape:", embeddings.shape)
```
Updated version of the usage example:

```python
from transformers import AutoTokenizer, AutoModel
import logging
import torch

# Suppress warnings about newly initialized 'esm.pooler.dense.bias', 'esm.pooler.dense.weight' layers - these are not used to extract embeddings
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer and model
model_name = "ChatterjeeLab/FusOn-pLM"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(device)
model.eval()

# Example fusion oncoprotein sequence: MLLT10:PICALM, associated with Acute Myeloid Leukemia (LAML)
# Amino acids 1-80 are derived from the head gene, MLLT10
sequence = "MVSSDRPVSLEDEVSHSMKEMIGGCCVCSDERGWAENPLVYCDGHGCSVAVHQACYGIVQVPTGPWFCRKCESQERAARVPPQMGSVPVMTQPTLIYSQPVMRPPNPFGPVSGAQIQFM"

# Tokenize the input sequence
inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=2000)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Get the embeddings
with torch.no_grad():
    outputs = model(**inputs)
    # The embeddings are in the last_hidden_state tensor
    embeddings = outputs.last_hidden_state
    # Remove the extra batch dimension
    embeddings = embeddings.squeeze(0)
    # Remove BOS and EOS tokens
    embeddings = embeddings[1:-1, :]

# Convert embeddings to numpy array (if needed); move to CPU first in case a GPU was used
embeddings = embeddings.cpu().numpy()

print("Per-residue embeddings shape:", embeddings.shape)
```
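The example above produces one embedding per residue, with shape (sequence_length, hidden_dim). For tasks that need a single fixed-length vector per fusion protein, one common convention is to average over the residue dimension. The short sketch below continues from the `embeddings` array computed above; mean pooling is an assumption for illustration, not necessarily the pooling strategy used in the FusOn-pLM paper.

```python
# Continues from the snippet above: `embeddings` is a (seq_len, hidden_dim) numpy array.
# Average over the residue axis to get one fixed-length vector for the whole protein.
# (Mean pooling is one common choice; max pooling or the BOS-token embedding are alternatives.)
sequence_embedding = embeddings.mean(axis=0)

print("Sequence-level embedding shape:", sequence_embedding.shape)  # (hidden_dim,)
```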