# Unigram-Watermark / gptwm.py
import hashlib
from typing import List, Tuple
import numpy as np
from scipy.stats import norm
import torch
from transformers import LogitsWarper


class GPTWatermarkBase:
    """
    Base class for watermarking distributions with fixed-group green-listed tokens.

    Args:
        fraction: The fraction of the distribution to be green-listed.
        strength: The strength of the green-listing. Higher values result in higher logit scores for green-listed tokens.
        vocab_size: The size of the vocabulary.
        watermark_key: The random seed for the green-listing.
    """

    def __init__(self, fraction: float = 0.5, strength: float = 2.0, vocab_size: int = 50257, watermark_key: int = 0):
        # Seed the RNG with a hash of the watermark key so the green list is reproducible.
        rng = np.random.default_rng(self._hash_fn(watermark_key))
        # Mark a `fraction` of the vocabulary as green, then shuffle so the green
        # list is a fixed random subset of token ids determined only by the key.
        mask = np.array([True] * int(fraction * vocab_size) + [False] * (vocab_size - int(fraction * vocab_size)))
        rng.shuffle(mask)
        self.green_list_mask = torch.tensor(mask, dtype=torch.float32)
        self.strength = strength
        self.fraction = fraction

    @staticmethod
    def _hash_fn(x: int) -> int:
        """Deterministic 32-bit hash of an integer key; adapted from
        https://stackoverflow.com/questions/67219691/python-hash-function-that-returns-32-or-64-bits"""
        x = np.int64(x)
        return int.from_bytes(hashlib.sha256(x).digest()[:4], 'little')
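
# Note: the green list is a pure function of `watermark_key` (plus fraction and
# vocab_size), so a warper and a detector built with the same key share the same
# mask. Quick sanity check (illustrative sketch, not part of the original file):
#
#   assert torch.equal(GPTWatermarkBase(watermark_key=42).green_list_mask,
#                      GPTWatermarkBase(watermark_key=42).green_list_mask)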


class GPTWatermarkLogitsWarper(GPTWatermarkBase, LogitsWarper):
    """
    LogitsWarper for watermarking distributions with fixed-group green-listed tokens.

    Args:
        fraction: The fraction of the distribution to be green-listed.
        strength: The strength of the green-listing. Higher values result in higher logit scores for green-listed tokens.
        vocab_size: The size of the vocabulary.
        watermark_key: The random seed for the green-listing.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.FloatTensor:
        """Add the watermark to the logits and return new logits."""
        # Boost every green-listed token's logit by the constant `strength`;
        # red-listed tokens are left unchanged.
        watermark = self.strength * self.green_list_mask
        new_logits = scores + watermark.to(scores.device)
        return new_logits
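
# Example usage (sketch): attaching the warper to Hugging Face text generation.
# The model name, prompt, and sampling settings below are illustrative
# assumptions, not part of this file.
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList
#   tokenizer = AutoTokenizer.from_pretrained("gpt2")
#   model = AutoModelForCausalLM.from_pretrained("gpt2")
#   warper = GPTWatermarkLogitsWarper(fraction=0.5, strength=2.0, vocab_size=len(tokenizer))
#   inputs = tokenizer("The quick brown fox", return_tensors="pt")
#   out = model.generate(**inputs, max_new_tokens=64, do_sample=True,
#                        logits_processor=LogitsProcessorList([warper]))
#   print(tokenizer.decode(out[0], skip_special_tokens=True))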


class GPTWatermarkDetector(GPTWatermarkBase):
    """
    Class for detecting watermarks in a sequence of tokens.

    Args:
        fraction: The fraction of the distribution to be green-listed.
        strength: The strength of the green-listing. Higher values result in higher logit scores for green-listed tokens.
        vocab_size: The size of the vocabulary.
        watermark_key: The random seed for the green-listing.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def _z_score(num_green: int, total: int, fraction: float) -> float:
        """Calculate and return the z-score of the number of green tokens in a sequence."""
        # One-proportion z-test against the null hypothesis that tokens land in
        # the green list at rate `fraction` by chance.
        return (num_green - fraction * total) / np.sqrt(fraction * (1 - fraction) * total)

    @staticmethod
    def _compute_tau(m: int, N: int, alpha: float) -> float:
        """
        Compute the threshold tau for the dynamic thresholding.

        Args:
            m: The number of unique tokens in the sequence.
            N: Vocabulary size.
            alpha: The false positive rate to control.
        Returns:
            The threshold tau.
        """
        # Note: 1 - (m - 1) / (N - 1) == (N - m) / (N - 1), the finite-population
        # correction for drawing m distinct tokens from a vocabulary of N, so tau
        # scales the Gaussian quantile norm.ppf(1 - alpha) accordingly.
        factor = np.sqrt(1 - (m - 1) / (N - 1))
        tau = factor * norm.ppf(1 - alpha)
        return tau
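
    # Worked example for _compute_tau (illustrative numbers): m=100 unique tokens,
    # N=50257, alpha=0.01 gives factor = sqrt(1 - 99/50256) ~= 0.999 and
    # norm.ppf(0.99) ~= 2.326, so tau ~= 2.324; a unique-token z-score above that
    # threshold flags the sequence as watermarked at false positive rate 0.01.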

    def detect(self, sequence: List[int]) -> Tuple[float, List[bool], int, int]:
        """Detect the watermark in a sequence of tokens. Returns the z-score, the
        per-token green-list mask, the green-token count, and the sequence length."""
        # A token contributes to the count iff its id is on the green list.
        green_tokens_mask = [bool(self.green_list_mask[i]) for i in sequence]
        green_tokens = sum(green_tokens_mask)
        return self._z_score(green_tokens, len(sequence), self.fraction), green_tokens_mask, green_tokens, len(sequence)

    def unidetect(self, sequence: List[int]) -> float:
        """Detect the watermark using only the unique tokens in the sequence and return the z-score."""
        sequence = list(set(sequence))
        green_tokens = int(sum(self.green_list_mask[i] for i in sequence))
        return self._z_score(green_tokens, len(sequence), self.fraction)

    def dynamic_threshold(self, sequence: List[int], alpha: float, vocab_size: int) -> Tuple[bool, float]:
        """Dynamic thresholding for watermark detection. Returns (is_watermarked, z_score)."""
        z_score = self.unidetect(sequence)
        tau = self._compute_tau(len(set(sequence)), vocab_size, alpha)
        return z_score > tau, z_score
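

if __name__ == "__main__":
    # Minimal end-to-end sketch on synthetic token ids (no model required).
    # All numbers below are illustrative assumptions, not values from the repo.
    detector = GPTWatermarkDetector(fraction=0.5, strength=2.0, vocab_size=50257)
    green_ids = torch.nonzero(detector.green_list_mask > 0).squeeze(-1).tolist()
    red_ids = torch.nonzero(detector.green_list_mask == 0).squeeze(-1).tolist()
    rng = np.random.default_rng(0)
    # 80% green tokens mimics watermarked text; unwatermarked text sits near 50%.
    sequence = rng.choice(green_ids, size=80, replace=False).tolist() + \
               rng.choice(red_ids, size=20, replace=False).tolist()
    z, _, num_green, total = detector.detect(sequence)
    is_watermarked, z_unique = detector.dynamic_threshold(sequence, alpha=0.01, vocab_size=50257)
    print(f"z = {z:.2f}, green = {num_green}/{total}, watermarked @ alpha=0.01: {is_watermarked}")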