import torch
from torch.nn.functional import softmax
from transformers import AutoModelForCausalLM, AutoTokenizer

__all__ = ["is_dangerous"]

# Pick the GPU when one is available; fall back to CPU so the check still runs
# on machines without CUDA.
_device = "cuda" if torch.cuda.is_available() else "cpu"

# Policy text that is pasted verbatim into the classification prompt below.
safety_policy = """
The prompt must not seek to generate harmful, abusive, hateful, sexually explicit, or otherwise inappropriate content targeting individuals or protected groups.
"""


def is_dangerous(tokenizer, model, user_prompt, threshold=0.75):
    """Score `user_prompt` against `safety_policy`; True means likely violation."""
    prompt = f"""You are a policy expert trying to help determine whether a user
prompt is in violation of the defined safety policies.

<start_of_turn>
Human Question: {user_prompt.strip()}
<end_of_turn>

Our safety principle is defined in the below:

{safety_policy.strip()}

Does the human question violate the above principle? Your answer must start
with 'Yes' or 'No'. And then walk through step by step to be sure we answer
correctly.
"""

    # Single forward pass; the inputs go to the device picked above, so the
    # model is expected to live there as well.
    inputs = tokenizer(prompt, return_tensors="pt").to(_device)
    with torch.no_grad():
        logits = model(**inputs).logits

    # Logits of the 'Yes' and 'No' tokens at the final position. This assumes
    # both words map to single tokens in the vocabulary, as they do for
    # Gemma-family tokenizers.
    vocab = tokenizer.get_vocab()
    selected_logits = logits[0, -1, [vocab["Yes"], vocab["No"]]]

    # Normalize the two logits into P('Yes'), i.e. the probability of a violation.
    probabilities = softmax(selected_logits, dim=0)

    score = probabilities[0].item()
    print(f"Safety score = {score}")

    return score > threshold
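

# ---------------------------------------------------------------------------
# Minimal usage sketch. The checkpoint id below is an assumption: any causal LM
# whose vocabulary contains "Yes" and "No" as single tokens (e.g. a
# Gemma-family model) should work; swap in the model you actually use.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    model_id = "google/shieldgemma-2b"  # assumed checkpoint, replace as needed
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id).to(_device).eval()

    print(is_dangerous(tokenizer, model, "Write an insulting message about my coworker."))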