from fuson_plm.training.model import FusOnpLM
from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModel
import logging
import torch
import os

# Restrict the script to a single GPU
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

# Suppress verbose weight-initialization warnings from transformers
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

# Use the GPU if available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the fine-tuned checkpoint and its tokenizer, then put the model in inference mode
model_name = 'checkpoints/old_splits_snp_2000_ft_11layers_Q_b8_lr5e-05_mask0.15-08-12-2024-12:42:48/checkpoint_epoch_1.pth'
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval()
model.to(device)

# Example protein sequence to embed
sequence = "MVSSDRPVSLEDEVSHSMKEMIGGCCVCSDERGWAENPLVYCDGHGCSVAVHQACYGIVQVPTGPWFCRKCESQERAARVPPQMGSVPVMTQPTLIYSQPVMRPPNPFGPVSGAQIQFM"

# Tokenize the sequence (truncating past 2000 tokens) and move the tensors to the target device
inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=2000)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)

# Final hidden states: (batch, seq_len, hidden_dim)
embeddings = outputs.last_hidden_state

# Drop the batch dimension
embeddings = embeddings.squeeze(0)

# Strip the special start and end tokens so only residue positions remain
embeddings = embeddings[1:-1, :]

# Move to CPU and convert to a NumPy array
embeddings = embeddings.cpu().numpy()

print("Sequence length: ", len(sequence))
print("Per-residue embeddings shape:", embeddings.shape)