# LLM_watermarking/watermark.py
import os

import torch
from scipy.optimize import fminbound
from scipy.special import betainc, gammainc, gammaincc
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessor

hf_token = os.getenv('HF_TOKEN')
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
def hash_tokens(input_ids: torch.LongTensor, key: int):
    """Hash a window of token ids into a seed, so that the same context
    always yields the same pseudo-random draw at detection time."""
    seed = key
    salt = 35317
    for i in input_ids:
        seed = (seed * salt + i.item()) % (2 ** 64 - 1)
    return seed
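
# Note: the hash is deterministic in (window, key), which is what keeps the
# generator streams in sync between embedding and detection. For example,
#   hash_tokens(torch.tensor([5, 7, 11]), key=42)
# returns the same seed on both sides, while changing any token in the
# window (e.g. after a paraphrasing attack) changes the seed.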
class WatermarkingLogitsProcessor(LogitsProcessor):
    """Base class holding one torch.Generator per batch element.

    With window_size == 0 the generators are seeded once with the key and
    advance at every generation step; otherwise they are re-seeded at each
    step from a hash of the last window_size tokens."""
    def __init__(self, n, key, messages, window_size, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.batch_size = len(messages)
        self.generators = [torch.Generator(device=device) for _ in range(self.batch_size)]
        self.n = n
        self.key = key
        self.window_size = window_size
        if not self.window_size:
            for b in range(self.batch_size):
                self.generators[b].manual_seed(self.key)
        self.messages = messages
class WatermarkingAaronsonLogitsProcessor(WatermarkingLogitsProcessor):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        B, V = scores.shape
        # draw uniform variables: generate n >= V values but keep only V,
        # to keep the pseudo-random sequences in sync with the detector
        r = torch.zeros((B, self.n), device=scores.device)
        for b in range(B):
            if self.window_size:
                window = input_ids[b, -self.window_size:]
                seed = hash_tokens(window, self.key)
                self.generators[b].manual_seed(seed)
            # log(U), rolled by the message to embed the payload
            r[b] = torch.rand(self.n, generator=self.generators[b],
                              device=self.generators[b].device).log().roll(-self.messages[b])
        r = r[:, :V]
        # Aaronson's rule picks the token maximizing U^(1/p), i.e. log(U)/p.
        # Returning log(U) / exp(logits) preserves that argmax, since
        # exp(logits) equals the probabilities up to a positive per-row
        # constant and log(U) is negative; this processor is therefore
        # meant to be used with greedy decoding (do_sample=False).
        return r / scores.exp()
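
# Sanity-check sketch (not part of the pipeline): argmax_i U_i^(1/p_i)
# selects token i with probability exactly p_i, so greedy decoding on the
# processed scores is distributed like ordinary sampling from the model.
# A quick Monte-Carlo check of that fact:
#
#   p = torch.tensor([0.7, 0.2, 0.1])
#   counts = torch.zeros(3)
#   for _ in range(10000):
#       u = torch.rand(3)
#       counts[torch.argmax(u.log() / p)] += 1
#   print(counts / 10000)  # approximately [0.7, 0.2, 0.1]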
class WatermarkingKirchenbauerLogitsProcessor(WatermarkingLogitsProcessor):
    def __init__(self, *args, gamma=0.5, delta=4.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.gamma = gamma
        self.delta = delta

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        B, V = scores.shape
        for b in range(B):
            if self.window_size:
                window = input_ids[b, -self.window_size:]
                seed = hash_tokens(window, self.key)
                self.generators[b].manual_seed(seed)
            # pseudo-random permutation of the vocabulary; the first
            # gamma * n entries form the green list
            vocab_permutation = torch.randperm(self.n, generator=self.generators[b],
                                               device=self.generators[b].device)
            greenlist = vocab_permutation[:int(self.gamma * self.n)]
            bias = torch.zeros(self.n, device=scores.device)
            bias[greenlist] = self.delta
            # roll by the message to embed the payload, then truncate to V
            bias = bias.roll(-self.messages[b])[:V]
            scores[b] += bias  # add delta to the green-list logits
        return scores
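
# Payload embedding note: rolling the pseudo-random vector by -message shifts
# which token ids look "watermarked". Detection (below) accumulates a score
# for every one of the 2**payload_bits candidate shifts and returns the
# best-scoring one, which recovers the embedded message.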
class Watermarker:
    def __init__(self, modelname="facebook/opt-350m", window_size=0, payload_bits=0,
                 logits_processor=None, *args, **kwargs):
        self.tokenizer = AutoTokenizer.from_pretrained(modelname, use_auth_token=hf_token)
        self.model = AutoModelForCausalLM.from_pretrained(modelname, use_auth_token=hf_token).to(device)
        self.model.eval()
        self.window_size = window_size
        # extra logits processors (e.g. top-p/top-k warpers) applied before watermarking
        self.logits_processor = logits_processor or []
        self.payload_bits = payload_bits
        # the pseudo-random vectors must cover both the vocabulary and all
        # 2**payload_bits cyclic shifts of the message
        self.V = max(2 ** payload_bits, self.model.config.vocab_size)
        self.generator = torch.Generator(device=device)
    def embed(self, key=42, messages=[1234], prompt="", max_length=30, method='aaronson'):
        B = len(messages)  # batch size
        # check that every message fits in the payload capacity
        if self.payload_bits:
            assert all(0 <= message < 2 ** self.payload_bits for message in messages)
        # tokenize prompt
        inputs = self.tokenizer([prompt] * B, return_tensors="pt")
        if method == 'aaronson':
            # greedy search on the transformed scores implements Aaronson's sampling
            generated_ids = self.model.generate(
                inputs.input_ids.to(device), max_length=max_length, do_sample=False,
                logits_processor=self.logits_processor + [
                    WatermarkingAaronsonLogitsProcessor(n=self.V, key=key, messages=messages,
                                                        window_size=self.window_size)])
        elif method == 'kirchenbauer':
            # multinomial sampling on the biased logits
            generated_ids = self.model.generate(
                inputs.input_ids.to(device), max_length=max_length, do_sample=True,
                logits_processor=self.logits_processor + [
                    WatermarkingKirchenbauerLogitsProcessor(n=self.V, key=key, messages=messages,
                                                            window_size=self.window_size)])
        elif method == 'greedy':
            # unwatermarked baseline: greedy search
            generated_ids = self.model.generate(inputs.input_ids.to(device), max_length=max_length,
                                                do_sample=False, logits_processor=self.logits_processor)
        elif method == 'sampling':
            # unwatermarked baseline: multinomial sampling
            generated_ids = self.model.generate(inputs.input_ids.to(device), max_length=max_length,
                                                do_sample=True, logits_processor=self.logits_processor)
        else:
            raise ValueError('Unknown method %s' % method)
        decoded_texts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True,
                                                    clean_up_tokenization_spaces=False)
        return decoded_texts
    def detect(self, attacked_texts, key=42, method='aaronson', gamma=0.5, prompts=None):
        if prompts is None:
            prompts = [""] * len(attacked_texts)
        generator = self.generator
        cdfs = []
        ms = []
        MAX = 2 ** self.payload_bits
        # tokenize input
        inputs = self.tokenizer(attacked_texts, return_tensors="pt", padding=True,
                                return_attention_mask=True)
        input_ids = inputs["input_ids"].to(self.model.device)
        attention_masks = inputs["attention_mask"].to(self.model.device)
        B, T = input_ids.shape
        if method == 'aaronson_neyman_pearson':
            # compute the model probabilities of the observed tokens
            outputs = self.model.forward(input_ids, return_dict=True)
            logits = outputs['logits']
            # TODO: reapply the logits processors to score under the exact
            # distribution used at generation time:
            # for i in range(T):
            #     for processor in self.logits_processor:
            #         logits[:, i] = processor(input_ids[:, :i], logits[:, i])
            probs = logits.softmax(dim=-1)
            # ps[b, i] is the probability the model assigned to token i+1
            ps = torch.gather(probs, 2, input_ids[:, 1:, None]).squeeze_(-1)
        seq_len = input_ids.shape[1]
        V = self.V
        Z = torch.zeros(size=(B, V), dtype=torch.float32, device=device)
        # keep a history of contexts we have already seen,
        # to exclude them from score aggregation and allow
        # correct p-value computation under H0
        history = [set() for _ in range(B)]
        # mask out the prompt tokens so that only generated tokens are scored
        attention_masks_prompts = self.tokenizer(prompts, return_tensors="pt", padding=True,
                                                 return_attention_mask=True)["attention_mask"]
        prompts_length = torch.sum(attention_masks_prompts, dim=1)
        for b in range(B):
            attention_masks[b, :prompts_length[b]] = 0
            if not self.window_size:
                generator.manual_seed(key)
            # note: we could start at prompts_length[b] instead of 0,
            # since earlier positions are masked out anyway
            for i in range(seq_len - 1):
                if self.window_size:
                    window = input_ids[b, max(0, i - self.window_size + 1):i + 1]
                    seed = hash_tokens(window, key)
                    if seed not in history[b]:
                        generator.manual_seed(seed)
                        history[b].add(seed)
                    else:
                        # repeated context: ignore the token
                        attention_masks[b, i + 1] = 0
                if not attention_masks[b, i + 1]:
                    continue
                token = int(input_ids[b, i + 1])
                if method in {'aaronson', 'aaronson_simplified', 'aaronson_neyman_pearson'}:
                    R = torch.rand(V, generator=generator, device=generator.device)
                    if method == 'aaronson':
                        # -log(1 - U) is Exp(1) distributed under H0
                        r = -(1 - R).log()
                    elif method in {'aaronson_simplified', 'aaronson_neyman_pearson'}:
                        r = -R.log()
                elif method == 'kirchenbauer':
                    r = torch.zeros(V, device=device)
                    vocab_permutation = torch.randperm(V, generator=generator, device=generator.device)
                    greenlist = vocab_permutation[:int(gamma * V)]
                    r[greenlist] = 1
                else:
                    raise ValueError('Unknown method %s' % method)
                if method in {'aaronson', 'aaronson_simplified', 'kirchenbauer'}:
                    # independent of the model probabilities
                    Z[b] += r.roll(-token)
                elif method == 'aaronson_neyman_pearson':
                    # Neyman-Pearson weighting by the token probabilities
                    Z[b] += r.roll(-token) * (1 / ps[b, i] - 1)
        for b in range(B):
            if method in {'aaronson', 'kirchenbauer'}:
                # the embedded message is the shift with the largest score
                m = torch.argmax(Z[b, :MAX])
            elif method in {'aaronson_simplified', 'aaronson_neyman_pearson'}:
                m = torch.argmin(Z[b, :MAX])
            m = int(m)
            S = Z[b, m].item()
            # actual number of scored tokens
            k = torch.sum(attention_masks[b]).item() - 1
            if method == 'aaronson':
                # S is a sum of k Exp(1) variables under H0: Gamma(k, 1) survival
                cdf = gammaincc(k, S)
            elif method == 'aaronson_simplified':
                cdf = gammainc(k, S)
            elif method == 'aaronson_neyman_pearson':
                # Chernoff bound
                ratio = ps[b, :k] / (1 - ps[b, :k])
                E = (1 / ratio).sum()
                if S > E:
                    cdf = 1.0
                else:
                    # to compute the p-value we must solve for c*:
                    # (1 / (c* + ps / (1 - ps))).sum() = S
                    func = lambda c: (((1 / (c + ratio)).sum() - S) ** 2).item()
                    c1 = (k / S - torch.min(ratio)).item()
                    c = fminbound(func, 0, c1)
                    # upper bound on the p-value
                    cdf = torch.exp(torch.sum(-torch.log(1 + c / ratio)) + c * S)
            elif method == 'kirchenbauer':
                # S green tokens out of k: binomial tail P(X >= S), X ~ Bin(k, gamma)
                cdf = betainc(S, k - S + 1, gamma)
            # correct for testing all 2**payload_bits candidate messages
            if cdf > min(1 / MAX, 1e-5):
                cdf = 1 - (1 - cdf) ** MAX  # true value
            else:
                cdf = cdf * MAX  # numerically stable upper bound
            cdfs.append(float(cdf))
            ms.append(m)
        return cdfs, ms
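

# Minimal usage sketch (assumes an HF-accessible model and, optionally, a GPU);
# the key, payload, and prompt below are arbitrary examples, not part of the API.
if __name__ == "__main__":
    watermarker = Watermarker(modelname="facebook/opt-350m", window_size=0, payload_bits=4)
    # embed the message 11 (fits in 4 bits) while generating
    texts = watermarker.embed(key=42, messages=[11], prompt="The meaning of life is",
                              max_length=60, method='aaronson')
    print(texts[0])
    # detect: a low p-value indicates a watermark; ms recovers the embedded message
    pvalues, ms = watermarker.detect(texts, key=42, method='aaronson',
                                     prompts=["The meaning of life is"])
    print("p-value =", pvalues[0], "decoded message =", ms[0])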