To use the pretrained model with your own sequences, refer to the code below.
import os
import torch
import importlib.util
from tokenizers import Tokenizer
from huggingface_hub import hf_hub_download
# Function to download and load a Python module from the Hugging Face Hub.
def load_module_from_hub(repo_id, filename):
    file_path = hf_hub_download(repo_id=repo_id, filename=filename)
    module_name = os.path.splitext(os.path.basename(file_path))[0]
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
# Repository ID where your files are hosted.
repo_id = "dzjxzyd/PepBERT-large-UniRef100"
# 1) Download and load model.py and config.py dynamically.
model_module = load_module_from_hub(repo_id, "model.py")
config_module = load_module_from_hub(repo_id, "config.py")
build_transformer = model_module.build_transformer
get_config = config_module.get_config
# 2) Download tokenizer.json and load the tokenizer.
tokenizer_path = hf_hub_download(repo_id=repo_id, filename="tokenizer.json")
tokenizer = Tokenizer.from_file(tokenizer_path)
# 3) Download model weights (tmodel_17.pt).
weights_path = hf_hub_download(repo_id=repo_id, filename="tmodel_17.pt")
# 4) Initialize the model structure and load the weights.
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_built() or torch.backends.mps.is_available() else "cpu"
config = get_config()
model = build_transformer(
src_vocab_size=tokenizer.get_vocab_size(),
src_seq_len=config["seq_len"],
d_model=config["d_model"]
)
state = torch.load(weights_path, map_location=torch.device(device))
model.load_state_dict(state["model_state_dict"])
model.eval()
# 5) Generate embeddings for an example amino acid sequence.
# Add special tokens [SOS] and [EOS] as in training.
sequence = "KRKGFLGI"
encoded_ids = (
    [tokenizer.token_to_id("[SOS]")]
    + tokenizer.encode(sequence).ids
    + [tokenizer.token_to_id("[EOS]")]
)
input_ids = torch.tensor([encoded_ids], dtype=torch.int64).to(device)
with torch.no_grad():
    # Create a simple attention mask (all ones, since no padding is used here).
    encoder_mask = torch.ones((1, 1, 1, input_ids.size(1)), dtype=torch.int64, device=device)
    # Forward pass through the encoder to get per-token embeddings.
    emb = model.encode(input_ids, encoder_mask)
    # Remove the first ([SOS]) and last ([EOS]) token embeddings.
    emb_no_first_last = emb[:, 1:-1, :]
    # Average-pool the remaining tokens to get a fixed-size sequence embedding.
    emb_avg = emb_no_first_last.mean(dim=1)

print("Shape of emb_avg:", emb_avg.shape)
print("emb_avg:", emb_avg)