# LLM_watermarking/watermark.py
import os

import torch
from scipy.optimize import fminbound
from scipy.special import betainc, gammainc, gammaincc
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessor

hf_token = os.getenv('HF_TOKEN')
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
def hash_tokens(input_ids: torch.LongTensor, key: int):
    """Hash a window of token ids into a seed, so that the same context
    always yields the same pseudo-random draw at detection time."""
    seed = key
    salt = 35317
    for i in input_ids:
        seed = (seed * salt + i.item()) % (2 ** 64 - 1)
    return seed
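
# Note: the hash is deterministic in (window, key), which is what keeps the
# generator streams in sync between embedding and detection. For example,
#   hash_tokens(torch.tensor([5, 7, 11]), key=42)
# returns the same seed on both sides, while changing any token in the
# window (e.g. after a paraphrasing attack) changes the seed.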
class WatermarkingLogitsProcessor(LogitsProcessor):
    """Base class holding one torch.Generator per batch element.

    With window_size == 0 the generators are seeded once with the key and
    advance at every generation step; otherwise they are re-seeded at each
    step from a hash of the last window_size tokens."""
    def __init__(self, n, key, messages, window_size, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.batch_size = len(messages)
        self.generators = [torch.Generator(device=device) for _ in range(self.batch_size)]
        self.n = n
        self.key = key
        self.window_size = window_size
        if not self.window_size:
            for b in range(self.batch_size):
                self.generators[b].manual_seed(self.key)
        self.messages = messages
class WatermarkingAaronsonLogitsProcessor(WatermarkingLogitsProcessor):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        B, V = scores.shape
        # draw uniform variables: generate n >= V values but keep only V,
        # to keep the pseudo-random sequences in sync with the detector
        r = torch.zeros((B, self.n), device=scores.device)
        for b in range(B):
            if self.window_size:
                window = input_ids[b, -self.window_size:]
                seed = hash_tokens(window, self.key)
                self.generators[b].manual_seed(seed)
            # log(U), rolled by the message to embed the payload
            r[b] = torch.rand(self.n, generator=self.generators[b],
                              device=self.generators[b].device).log().roll(-self.messages[b])
        r = r[:, :V]
        # Aaronson's rule picks the token maximizing U^(1/p), i.e. log(U)/p.
        # Returning log(U) / exp(logits) preserves that argmax, since
        # exp(logits) equals the probabilities up to a positive per-row
        # constant and log(U) is negative; this processor is therefore
        # meant to be used with greedy decoding (do_sample=False).
        return r / scores.exp()
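
# Sanity-check sketch (not part of the pipeline): argmax_i U_i^(1/p_i)
# selects token i with probability exactly p_i, so greedy decoding on the
# processed scores is distributed like ordinary sampling from the model.
# A quick Monte-Carlo check of that fact:
#
#   p = torch.tensor([0.7, 0.2, 0.1])
#   counts = torch.zeros(3)
#   for _ in range(10000):
#       u = torch.rand(3)
#       counts[torch.argmax(u.log() / p)] += 1
#   print(counts / 10000)  # approximately [0.7, 0.2, 0.1]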
class WatermarkingKirchenbauerLogitsProcessor(WatermarkingLogitsProcessor):
    def __init__(self, *args, gamma=0.5, delta=4.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.gamma = gamma
        self.delta = delta

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        B, V = scores.shape
        for b in range(B):
            if self.window_size:
                window = input_ids[b, -self.window_size:]
                seed = hash_tokens(window, self.key)
                self.generators[b].manual_seed(seed)
            # pseudo-random permutation of the vocabulary; the first
            # gamma * n entries form the green list
            vocab_permutation = torch.randperm(self.n, generator=self.generators[b],
                                               device=self.generators[b].device)
            greenlist = vocab_permutation[:int(self.gamma * self.n)]
            bias = torch.zeros(self.n, device=scores.device)
            bias[greenlist] = self.delta
            # roll by the message to embed the payload, then truncate to V
            bias = bias.roll(-self.messages[b])[:V]
            scores[b] += bias  # add delta to the green-list logits
        return scores
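
# Payload embedding note: rolling the pseudo-random vector by -message shifts
# which token ids look "watermarked". Detection (below) accumulates a score
# for every one of the 2**payload_bits candidate shifts and returns the
# best-scoring one, which recovers the embedded message.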
class Watermarker:
    def __init__(self, modelname="facebook/opt-350m", window_size=0, payload_bits=0,
                 logits_processor=None, *args, **kwargs):
        self.tokenizer = AutoTokenizer.from_pretrained(modelname, use_auth_token=hf_token)
        self.model = AutoModelForCausalLM.from_pretrained(modelname, use_auth_token=hf_token).to(device)
        self.model.eval()
        self.window_size = window_size
        # extra logits processors (e.g. top-p/top-k warpers) applied before watermarking
        self.logits_processor = logits_processor or []
        self.payload_bits = payload_bits
        # the pseudo-random vectors must cover both the vocabulary and all
        # 2**payload_bits cyclic shifts of the message
        self.V = max(2 ** payload_bits, self.model.config.vocab_size)
        self.generator = torch.Generator(device=device)
    def embed(self, key=42, messages=[1234], prompt="", max_length=30, method='aaronson'):
        B = len(messages)  # batch size
        # check that every message fits in the payload capacity
        if self.payload_bits:
            assert all(0 <= message < 2 ** self.payload_bits for message in messages)
        # tokenize prompt
        inputs = self.tokenizer([prompt] * B, return_tensors="pt")
        if method == 'aaronson':
            # greedy search on the transformed scores implements Aaronson's sampling
            generated_ids = self.model.generate(
                inputs.input_ids.to(device), max_length=max_length, do_sample=False,
                logits_processor=self.logits_processor + [
                    WatermarkingAaronsonLogitsProcessor(n=self.V, key=key, messages=messages,
                                                        window_size=self.window_size)])
        elif method == 'kirchenbauer':
            # multinomial sampling on the biased logits
            generated_ids = self.model.generate(
                inputs.input_ids.to(device), max_length=max_length, do_sample=True,
                logits_processor=self.logits_processor + [
                    WatermarkingKirchenbauerLogitsProcessor(n=self.V, key=key, messages=messages,
                                                            window_size=self.window_size)])
        elif method == 'greedy':
            # unwatermarked baseline: greedy search
            generated_ids = self.model.generate(inputs.input_ids.to(device), max_length=max_length,
                                                do_sample=False, logits_processor=self.logits_processor)
        elif method == 'sampling':
            # unwatermarked baseline: multinomial sampling
            generated_ids = self.model.generate(inputs.input_ids.to(device), max_length=max_length,
                                                do_sample=True, logits_processor=self.logits_processor)
        else:
            raise ValueError('Unknown method %s' % method)
        decoded_texts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True,
                                                    clean_up_tokenization_spaces=False)
        return decoded_texts
    def detect(self, attacked_texts, key=42, method='aaronson', gamma=0.5, prompts=None):
        if prompts is None:
            prompts = [""] * len(attacked_texts)
        generator = self.generator
        cdfs = []
        ms = []
        MAX = 2 ** self.payload_bits
        # tokenize input
        inputs = self.tokenizer(attacked_texts, return_tensors="pt", padding=True,
                                return_attention_mask=True)
        input_ids = inputs["input_ids"].to(self.model.device)
        attention_masks = inputs["attention_mask"].to(self.model.device)
        B, T = input_ids.shape
        if method == 'aaronson_neyman_pearson':
            # compute the model probabilities of the observed tokens
            outputs = self.model.forward(input_ids, return_dict=True)
            logits = outputs['logits']
            # TODO: reapply the logits processors to score under the exact
            # distribution used at generation time:
            # for i in range(T):
            #     for processor in self.logits_processor:
            #         logits[:, i] = processor(input_ids[:, :i], logits[:, i])
            probs = logits.softmax(dim=-1)
            # ps[b, i] is the probability the model assigned to token i+1
            ps = torch.gather(probs, 2, input_ids[:, 1:, None]).squeeze_(-1)
        seq_len = input_ids.shape[1]
        V = self.V
        Z = torch.zeros(size=(B, V), dtype=torch.float32, device=device)
        # keep a history of contexts we have already seen,
        # to exclude them from score aggregation and allow
        # correct p-value computation under H0
        history = [set() for _ in range(B)]
        # mask out the prompt tokens so that only generated tokens are scored
        attention_masks_prompts = self.tokenizer(prompts, return_tensors="pt", padding=True,
                                                 return_attention_mask=True)["attention_mask"]
        prompts_length = torch.sum(attention_masks_prompts, dim=1)
        for b in range(B):
            attention_masks[b, :prompts_length[b]] = 0
            if not self.window_size:
                generator.manual_seed(key)
            # note: we could start at prompts_length[b] instead of 0,
            # since earlier positions are masked out anyway
            for i in range(seq_len - 1):
                if self.window_size:
                    window = input_ids[b, max(0, i - self.window_size + 1):i + 1]
                    seed = hash_tokens(window, key)
                    if seed not in history[b]:
                        generator.manual_seed(seed)
                        history[b].add(seed)
                    else:
                        # repeated context: ignore the token
                        attention_masks[b, i + 1] = 0
                if not attention_masks[b, i + 1]:
                    continue
                token = int(input_ids[b, i + 1])
                if method in {'aaronson', 'aaronson_simplified', 'aaronson_neyman_pearson'}:
                    R = torch.rand(V, generator=generator, device=generator.device)
                    if method == 'aaronson':
                        # -log(1 - U) is Exp(1) distributed under H0
                        r = -(1 - R).log()
                    elif method in {'aaronson_simplified', 'aaronson_neyman_pearson'}:
                        r = -R.log()
                elif method == 'kirchenbauer':
                    r = torch.zeros(V, device=device)
                    vocab_permutation = torch.randperm(V, generator=generator, device=generator.device)
                    greenlist = vocab_permutation[:int(gamma * V)]
                    r[greenlist] = 1
                else:
                    raise ValueError('Unknown method %s' % method)
                if method in {'aaronson', 'aaronson_simplified', 'kirchenbauer'}:
                    # independent of the model probabilities
                    Z[b] += r.roll(-token)
                elif method == 'aaronson_neyman_pearson':
                    # Neyman-Pearson weighting by the token probabilities
                    Z[b] += r.roll(-token) * (1 / ps[b, i] - 1)
        for b in range(B):
            if method in {'aaronson', 'kirchenbauer'}:
                # the embedded message is the shift with the largest score
                m = torch.argmax(Z[b, :MAX])
            elif method in {'aaronson_simplified', 'aaronson_neyman_pearson'}:
                m = torch.argmin(Z[b, :MAX])
            m = int(m)
            S = Z[b, m].item()
            # actual number of scored tokens
            k = torch.sum(attention_masks[b]).item() - 1
            if method == 'aaronson':
                # S is a sum of k Exp(1) variables under H0: Gamma(k, 1) survival
                cdf = gammaincc(k, S)
            elif method == 'aaronson_simplified':
                cdf = gammainc(k, S)
            elif method == 'aaronson_neyman_pearson':
                # Chernoff bound
                ratio = ps[b, :k] / (1 - ps[b, :k])
                E = (1 / ratio).sum()
                if S > E:
                    cdf = 1.0
                else:
                    # to compute the p-value we must solve for c*:
                    # (1 / (c* + ps / (1 - ps))).sum() = S
                    func = lambda c: (((1 / (c + ratio)).sum() - S) ** 2).item()
                    c1 = (k / S - torch.min(ratio)).item()
                    c = fminbound(func, 0, c1)
                    # upper bound on the p-value
                    cdf = torch.exp(torch.sum(-torch.log(1 + c / ratio)) + c * S)
            elif method == 'kirchenbauer':
                # S green tokens out of k: binomial tail P(X >= S), X ~ Bin(k, gamma)
                cdf = betainc(S, k - S + 1, gamma)
            # correct for testing all 2**payload_bits candidate messages
            if cdf > min(1 / MAX, 1e-5):
                cdf = 1 - (1 - cdf) ** MAX  # true value
            else:
                cdf = cdf * MAX  # numerically stable upper bound
            cdfs.append(float(cdf))
            ms.append(m)
        return cdfs, ms
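

# Minimal usage sketch (assumes an HF-accessible model and, optionally, a GPU);
# the key, payload, and prompt below are arbitrary examples, not part of the API.
if __name__ == "__main__":
    watermarker = Watermarker(modelname="facebook/opt-350m", window_size=0, payload_bits=4)
    # embed the message 11 (fits in 4 bits) while generating
    texts = watermarker.embed(key=42, messages=[11], prompt="The meaning of life is",
                              max_length=60, method='aaronson')
    print(texts[0])
    # detect: a low p-value indicates a watermark; ms recovers the embedded message
    pvalues, ms = watermarker.detect(texts, key=42, method='aaronson',
                                     prompts=["The meaning of life is"])
    print("p-value =", pvalues[0], "decoded message =", ms[0])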