Fill-Mask · Transformers · PyTorch · esm · Inference Endpoints
svincoff committed
Commit d120392 · Parent: e069ad9

Update README.md

Files changed (1):
  README.md +17 -2
README.md CHANGED
@@ -10,12 +10,22 @@ In this work, we introduce **FusOn-pLM**, a novel pLM that fine-tunes the state-

```
from transformers import AutoTokenizer, AutoModel
+ import logging
import torch

+ # Suppress warnings about newly initialized 'esm.pooler.dense.bias', 'esm.pooler.dense.weight' layers - these are not used to extract embeddings
+ logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
+
+ # Set device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f"Using device: {device}")
+
# Load the tokenizer and model
model_name = "ChatterjeeLab/FusOn-pLM"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
+ model.to(device)
+ model.eval()

# Example fusion oncoprotein sequence: MLLT10:PICALM, associated with Acute Myeloid Leukemia (LAML)
# Amino acids 1-80 are derived from the head gene, MLLT10
@@ -23,16 +33,21 @@ model = AutoModel.from_pretrained(model_name)
sequence = "MVSSDRPVSLEDEVSHSMKEMIGGCCVCSDERGWAENPLVYCDGHGCSVAVHQACYGIVQVPTGPWFCRKCESQERAARVPPQMGSVPVMTQPTLIYSQPVMRPPNPFGPVSGAQIQFM"

# Tokenize the input sequence
- inputs = tokenizer(sequence, return_tensors="pt")
+ inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=2000)
+ inputs = {k: v.to(device) for k, v in inputs.items()}

# Get the embeddings
with torch.no_grad():
    outputs = model(**inputs)
    # The embeddings are in the last_hidden_state tensor
    embeddings = outputs.last_hidden_state
+     # remove the extra batch dimension
+     embeddings = embeddings.squeeze(0)
+     # remove the BOS and EOS tokens so only residue positions remain
+     embeddings = embeddings[1:-1, :]

# Convert embeddings to numpy array (if needed)
- embeddings = embeddings.squeeze(0).numpy()
+ embeddings = embeddings.cpu().numpy()

print("Per-residue embeddings shape:", embeddings.shape)
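
The updated snippet ends with a per-residue embedding matrix of shape (sequence length, hidden dimension). For downstream tasks that need a single vector per fusion oncoprotein, a common follow-up is to mean-pool over the residue axis. This is not part of the commit; it is a minimal sketch assuming `embeddings` is the NumPy array produced by the README code above.

```python
import numpy as np

# Assumes `embeddings` is the (L, D) NumPy array from the README snippet above,
# where L is the number of residues and D is the FusOn-pLM hidden dimension.
sequence_embedding = embeddings.mean(axis=0)  # mean-pool over residues -> shape (D,)

print("Sequence-level embedding shape:", sequence_embedding.shape)
```

Max-pooling or taking the BOS-token embedding (before it is stripped) are alternative reductions; which is preferable depends on the downstream task.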
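
The model card is tagged Fill-Mask, so the checkpoint can in principle also be loaded with a masked-language-modeling head and asked to predict masked residues. The sketch below is an illustration only: it assumes the hosted weights include the MLM head, which this commit does not state, and `mask_position` is an arbitrary example value.

```python
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

model_name = "ChatterjeeLab/FusOn-pLM"
tokenizer = AutoTokenizer.from_pretrained(model_name)
mlm_model = AutoModelForMaskedLM.from_pretrained(model_name)  # assumes the MLM head is present in the checkpoint
mlm_model.eval()

# Mask a single residue in a short fragment of the example sequence
fragment = "MVSSDRPVSLEDEVSHSMKE"
inputs = tokenizer(fragment, return_tensors="pt")
mask_position = 5  # arbitrary residue index (0-based); +1 below skips the BOS token
inputs["input_ids"][0, mask_position + 1] = tokenizer.mask_token_id

with torch.no_grad():
    logits = mlm_model(**inputs).logits

predicted_id = logits[0, mask_position + 1].argmax(-1).item()
print("Predicted residue at masked position:", tokenizer.convert_ids_to_tokens(predicted_id))
```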