hanjianghu/NBF-LLM-Dialogue-Embedding
Updated • 19
This is the pre-trained checkpoint for the paper:
Steering Dialogue Dynamics for Robustness against Multi-turn Jailbreaking Attacks
Hanjiang Hu, Alexander Robey, Changliu Liu
arXiv:2503.00187 | GitHub
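This checkpoint (models_best_nbf_released.pth) contains two jointly trained components: a neural state-space model (NeuralStateSpaceModel, stored under the checkpoint key "ssm") and a neural barrier function (NeuralBarrierFunction, stored under the checkpoint key "nbf").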
Both models operate on 768-dimensional sentence embeddings from all-mpnet-base-v2.
import torch
import torch.nn as nn
class NeuralStateSpaceModel(nn.Module):
    """Neural state-space model made of two three-layer MLPs.

    ``state_transition`` maps the concatenation of the previous state and the
    current input to the next state. ``observation_model`` maps the
    concatenation of that next state and the same input to an output.

    The attribute names and the ``nn.Sequential`` layer order determine the
    ``state_dict`` keys, so they must not change if saved checkpoints are to
    keep loading.
    """

    def __init__(self, state_dim: int, input_dim: int, output_dim: int, hidden_dim: int) -> None:
        """Build both MLPs.

        Args:
            state_dim: Size of the state vector (last dimension).
            input_dim: Size of the input vector (last dimension).
            output_dim: Size of the vector produced by the observation model.
            hidden_dim: Width of the two hidden layers in each MLP.
        """
        super().__init__()
        self.state_transition = nn.Sequential(
            nn.Linear(state_dim + input_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, state_dim),
        )
        self.observation_model = nn.Sequential(
            nn.Linear(state_dim + input_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x_t_pre: torch.Tensor, u_t: torch.Tensor):
        """Advance the state by one step and compute the matching output.

        Args:
            x_t_pre: Previous state; last dimension must be ``state_dim``.
            u_t: Current input; last dimension must be ``input_dim`` and the
                leading dimensions must match ``x_t_pre`` so the two can be
                concatenated.

        Returns:
            A tuple ``(x_t, y_t)``: the next state and the output computed
            from that next state together with ``u_t``.
        """
        xu = torch.cat([x_t_pre, u_t], dim=-1)
        x_t = self.state_transition(xu)
        # The observation is computed from the *updated* state, not x_t_pre.
        y_t = self.observation_model(torch.cat([x_t, u_t], dim=-1))
        return x_t, y_t
class NeuralBarrierFunction(nn.Module):
    """Three-layer MLP that maps a (state, input) pair to ``class_num`` values.

    The network is stored under the attribute ``nbf``. That name and the
    ``nn.Sequential`` layer order determine the ``state_dict`` keys, so they
    must not change if saved checkpoints are to keep loading.
    """

    def __init__(self, state_dim: int, input_dim: int, hidden_dim: int, class_num: int = 5) -> None:
        """Build the MLP.

        Args:
            state_dim: Size of the state vector (last dimension).
            input_dim: Size of the input vector (last dimension).
            hidden_dim: Width of the two hidden layers.
            class_num: Number of values produced per sample (default 5).
        """
        super().__init__()
        self.nbf = nn.Sequential(
            nn.Linear(state_dim + input_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, class_num),
        )

    def forward(self, x: torch.Tensor, u: torch.Tensor) -> torch.Tensor:
        """Evaluate the network on the concatenation of ``x`` and ``u``.

        Args:
            x: State; last dimension must be ``state_dim``.
            u: Input; last dimension must be ``input_dim`` and the leading
                dimensions must match ``x``.

        Returns:
            The raw output of the final linear layer (no activation applied),
            with last dimension ``class_num``.
        """
        return self.nbf(torch.cat([x, u], dim=-1))
# Download the released checkpoint from the HuggingFace Hub and restore both models.
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(repo_id="hanjianghu/NBF-LLM", filename="models_best_nbf_released.pth")
# NOTE(review): torch.load unpickles the downloaded file. If the checkpoint
# holds only tensors/state dicts, passing weights_only=True would be safer —
# confirm against the actual file before changing.
ckpt = torch.load(ckpt_path, map_location="cpu")

# State, input and output all share one dimension; both MLPs share one hidden width.
state_dim = input_dim = output_dim = 768
hidden_dim_ssm = hidden_dim_nbf = 512

# The checkpoint is indexed by "ssm" and "nbf", one state dict per model.
ssm = NeuralStateSpaceModel(
    state_dim=state_dim,
    input_dim=input_dim,
    output_dim=output_dim,
    hidden_dim=hidden_dim_ssm,
)
ssm.load_state_dict(ckpt["ssm"])

nbf = NeuralBarrierFunction(
    state_dim=state_dim,
    input_dim=input_dim,
    hidden_dim=hidden_dim_nbf,
)
nbf.load_state_dict(ckpt["nbf"])
For full steering pipeline usage, see the GitHub repository.
Trained on the NBF-LLM-Dialogue-Embedding dataset — sentence-transformer embeddings of multi-turn jailbreaking dialogues from four attack methods (ActorAttack, Crescendo, Acronym, Opposite-Day) against Circuit Breakers 1k training queries, validated on 200 HarmBench queries.
@article{hu2025steering,
title={Steering Dialogue Dynamics for Robustness against Multi-turn Jailbreaking Attacks},
author={Hu, Hanjiang and Robey, Alexander and Liu, Changliu},
journal={arXiv preprint arXiv:2503.00187},
year={2025}
}