Spaces:

falcondai
/

stego-lm

Sleeping

App Files Files Community

dai committed on Oct 6, 2023

Commit

178b66b

1 Parent(s): b3fbbe5

first release

Browse files

Files changed (3) hide show

README.md +4 -4
app.py +359 -0
huffman.py +181 -0

README.md CHANGED Viewed

@@ -1,13 +1,13 @@
 ---
-title: Stego Lm
-emoji: 🐨
 colorFrom: indigo
 colorTo: green
 sdk: gradio
-sdk_version: 3.28.0
 app_file: app.py
 pinned: false
 license: openrail
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Stego LM
+emoji: 🔒👀🙈
 colorFrom: indigo
 colorTo: green
 sdk: gradio
+sdk_version: 3.47.1
 app_file: app.py
 pinned: false
 license: openrail
 ---
+Hide the hiding.

app.py ADDED Viewed

	@@ -0,0 +1,359 @@

+#!/usr/bin/env python
+# A demo of linguistic steganography with the patient-Huffman algorithm.
+# We use symmetric key cryptography to en/decrypt.
+#
+# Reference:
+# Dai FZ, Cai Z. Towards Near-imperceptible Steganographic Text. ACL 2019.
+import nacl.secret
+import nacl.utils
+from transformers import GPT2TokenizerFast, GPT2LMHeadModel
+import gradio as gr
+import numpy as np
+import torch as th
+from huffman import build_min_heap, huffman_tree, tv_huffman, invert_code_tree
+# Checkpoint used for both hiding and recovering. The UI text below notes
+# that the choice of language model is a secret shared by both parties.
+# model_name = 'gpt2-xl'
+# XXX Use GPT-2-small for less compute
+model_name = 'gpt2'
+# Module-level model and tokenizer, loaded once at import time and used by
+# every function and handler below.
+lm = GPT2LMHeadModel.from_pretrained(model_name)
+tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
+def bits_to_recover(max_plaintext_length):
+    '''Return the number of ciphertext bits the receiver has to extract.
+    The ciphertext is counted as `max_plaintext_length` bytes of padded
+    plaintext plus 40 bytes of overhead, at 8 bits per byte.
+    NOTE(review): the 40 bytes are presumably the nonce and authenticator
+    that SecretBox.encrypt adds to the message -- confirm against PyNaCl.'''
+    overhead_bytes = 40
+    bits_per_byte = 8
+    ciphertext_bytes = max_plaintext_length + overhead_bytes
+    return ciphertext_bytes * bits_per_byte
+def p_next_token(prefix, cache=None, allow_eos=True):
+    '''Return the next-token distribution after `prefix` and the updated cache.
+    Args:
+        prefix : sequence of token indices fed to the language model.
+        cache : `past_key_values` from a previous call, or None. When given,
+            only the last index of `prefix` is fed to the model.
+        allow_eos : bool.
+            If False, the last vocabulary entry (assumed to be EOS) is dropped
+            before the softmax, so the result has one entry fewer.
+    Returns:
+        (probabilities as a numpy array, the model's `past_key_values`).'''
+    token_ids = th.as_tensor(prefix)
+    if cache:
+        # Incremental decoding. Input one token at a time with cache.
+        model_kwargs = dict(input_ids=token_ids[-1:], use_cache=True, past_key_values=cache)
+    else:
+        model_kwargs = dict(input_ids=token_ids, use_cache=True)
+    with th.no_grad():
+        lm_out = lm.forward(**model_kwargs)
+        # Only the logits at the last position predict the next token.
+        last_logits = lm_out.logits[-1]
+        if not allow_eos:
+            # Assume EOS is the last token in the vocabulary.
+            last_logits = last_logits[:-1]
+        probs = last_logits.softmax(dim=-1)
+    return probs.numpy(), lm_out.past_key_values
+def embed_bits(coin_flips, prefix, tv_threshold=0.1, max_sequence_length=400):
+    '''Generate token indices from the language model while hiding `coin_flips`.
+    At every step a Huffman code is built over the next-token distribution.
+    If the total variation between that distribution and the one induced by
+    the Huffman code is below `tv_threshold`, bits are consumed from
+    `coin_flips` to walk the code tree and choose the token. Otherwise the
+    token is sampled from the language model and no bits are hidden.
+    Args:
+        coin_flips : list of bits ('0'/'1' characters or 0/1 integers).
+            The bits to hide. Consumed from the front, in place.
+        prefix : str.
+            Text that conditions the model; its tokens are not yielded.
+        tv_threshold : float.
+            A step carries bits only if its total variation is below this.
+        max_sequence_length : int.
+            Cap on the token count, including BOS and the prefix tokens.
+    Yields:
+        The list of generated token indices after each new token (the same
+        list object, grown in place), then once more at the end with a
+        trailing EOS index dropped.'''
+    hidden_prefix_ind = [tokenizer.bos_token_id] + tokenizer.encode(prefix)
+    n_hidden_prefix_ind = len(hidden_prefix_ind)
+    done_hiding = False
+    p, kv = p_next_token(hidden_prefix_ind, allow_eos=done_hiding)
+    n_skips = 0
+    n_bits_encoded = 0
+    n_tail_fill = 0
+    ind = None
+    prefix_inds = []
+    # Terminate the generation after we generate the EOS token
+    # XXX to save computation, we terminate as soon as all bits are hidden.
+    while not done_hiding and n_hidden_prefix_ind + len(prefix_inds) < max_sequence_length and ind != tokenizer.eos_token_id:
+        # There is still some cipher text to hide
+        if coin_flips:
+            # Build Huffman codes for the conditional distribution
+            heap = build_min_heap(p)
+            hc = huffman_tree(heap)
+            # Check if the total variation is low enough
+            if tv_huffman(hc, p)[0] < tv_threshold:
+                # Huffman-decode the cipher text into a token
+                # Consume the cipher text until a token is generated
+                decoder_state = hc
+                while type(decoder_state) is tuple:
+                    left, right = decoder_state
+                    if coin_flips:
+                        bit = coin_flips.pop(0)
+                        n_bits_encoded += 1
+                    else:
+                        # No more cipher text. Pad with random bits
+                        bit = np.random.choice(2)
+                        n_tail_fill += 1
+                    # 0 => left, 1 => right. Compare on the string form so
+                    # that the integer padding bits (and integer input bits)
+                    # are treated like the '0'/'1' characters; comparing an
+                    # integer 0 with '0' directly is always False.
+                    decoder_state = left if str(bit) == '0' else right
+                # Decoder settles in a leaf node
+                ind = decoder_state
+                prefix_inds.append(ind)
+                yield prefix_inds
+                done_hiding = not bool(coin_flips)
+                p, kv = p_next_token(hidden_prefix_ind + prefix_inds, kv, done_hiding)
+                continue
+        # Forward sample according to LM normally
+        n_skips += 1 if coin_flips else 0
+        ind = np.random.choice(tokenizer.vocab_size if done_hiding else tokenizer.vocab_size - 1, p=p)
+        prefix_inds.append(ind)
+        yield prefix_inds
+        p, kv = p_next_token(hidden_prefix_ind + prefix_inds, kv, done_hiding)
+    print(prefix_inds)
+    print(len(prefix_inds), n_skips, n_bits_encoded, n_tail_fill)
+    # Drop the EOS index. Nothing is generated when the prefix alone already
+    # reaches max_sequence_length, so guard against an empty list.
+    if prefix_inds and prefix_inds[-1] == tokenizer.eos_token_id:
+        prefix_inds = prefix_inds[:-1]
+    yield prefix_inds
+def recover_bits(token_inds, tv_threshold, bits_to_recover, prefix):
+    '''Extract hidden bits from `token_inds` by replaying the language model.
+    For each token the Huffman code over the next-token distribution is
+    rebuilt. If its total variation is below `tv_threshold` the step is
+    treated as one that carried bits and the token's code word is appended to
+    the output; otherwise the token is skipped.
+    Args:
+        token_inds : list of token indices. Consumed from the front, in place.
+        tv_threshold : float. The threshold compared against the total variation.
+        bits_to_recover : int. Number of bits to extract before stopping.
+        prefix : str. Text that conditions the model before the tokens.
+    Yields:
+        The list of recovered '0'/'1' characters after each step that carried
+        bits (the same list object, grown in place), and once more at the end.'''
+    bits_left = bits_to_recover
+    hidden_prefix_inds = [tokenizer.bos_token_id] + tokenizer.encode(prefix)
+    p, kv = p_next_token(hidden_prefix_inds, allow_eos=False)
+    cipher_text = []
+    # Stop once every index is consumed or every bit is extracted
+    while token_inds and bits_left > 0:
+        # Build Huffman codes for the conditional distribution
+        hc = huffman_tree(build_min_heap(p))
+        # A step carried bits only if the total variation is low enough
+        controlled = tv_huffman(hc, p)[0] < tv_threshold
+        if controlled:
+            code = invert_code_tree(hc)
+        ind = token_inds.pop(0)
+        if controlled:
+            # The token's Huffman code word is the hidden fragment
+            # left => 0, right => 1
+            fragment = code[ind]
+            # Truncate possible trailing paddings
+            cipher_text.extend(fragment[:bits_left])
+            bits_left -= len(fragment)
+            yield cipher_text
+        # Either way the token extends the context for the next step
+        hidden_prefix_inds.append(ind)
+        p, kv = p_next_token(hidden_prefix_inds, cache=kv, allow_eos=False)
+    print(cipher_text, len(cipher_text), bits_to_recover)
+    yield cipher_text
+# Build the Gradio UI: usage notes, a collapsed panel with the shared
+# settings, and side-by-side sender and receiver columns.
+with gr.Blocks() as demo:
+    gr.Markdown('''
+        # Linguistic steganography demo with ``patient-Huffman`` algorithm
+        Instead of sending secrets in plaintext or in ciphertext, we can "hide the hiding" by embedding the encrypted secret in a natural looking message.
+        ## Usage for message sender
+        1. Type a short message. Click Encrypt to generate the ciphertext (encrypted text).
+        2. Click Hide to generate the stegotext/covertext.
+        ## Usage for message receiver
+        1. Copy-paste the received stegotext/covertext into the stegotext box. Click Recover to extract the hidden ciphertext.
+        2. Click Decrypt to decipher the original message.
+    ''')
+    # The panel starts collapsed (open=False).
+    with gr.Accordion(
+        'Secrets shared between sender and receiver',
+        open=False,
+    ):
+        # Shared secrets and parameters.
+        gr.Markdown('''
+        - The proposed stegosystem is agnostic to the choice of cryptosystem. We use the symmetric key encryption implemented in `pyNaCl` library.
+        - An encryption key is randomly generated, you can refresh the page to get a different one.
+        - The _choice_ of language model is a shared secret. Due to computation resource constraints, we use GPT-2 as an example.
+        - The communicating parties can share a prefix to further control the stegotext to appear more appropriate for the channel, e.g., blog posts, social media messages. Take extra care of the whitespaces.
+        - Imperceptibility threshold controls how much the distribution of stegotexts is allowed to deviate from the language model. Lower imperceptibility threshold produces longer stegotext.
+        Reference: Dai FZ, Cai Z. [Towards Near-imperceptible Steganographic Text](https://arxiv.org/abs/1907.06679). ACL 2019.
+        ''')
+        # State object passed into and returned from the encrypt/decrypt handlers.
+        state = gr.State()
+        with gr.Row():
+            # The default value is a callable, so a fresh random key is drawn
+            # each time the default is evaluated rather than once at import.
+            tb_shared_key = gr.Textbox(
+                label='encryption key (hex)',
+                value=lambda : nacl.utils.random(nacl.secret.SecretBox.KEY_SIZE).hex(),
+                interactive=True,
+                scale=1,
+                lines=3,
+            )
+            # dp_shared_lm = gr.Dropdown(
+            #     label='language model',
+            #     choices=[
+            #         'GPT-2',
+            #         # 'GPT-3',
+            #     ],
+            #     value='GPT-2',
+            # )
+            # Passed to the hide/recover handlers as `tv_threshold`.
+            s_shared_imp = gr.Slider(
+                label='imperceptibility threshold',
+                minimum=0,
+                maximum=1,
+                value=0.4,
+                scale=1,
+            )
+            # Plaintext is padded to this many bytes before encryption, and
+            # the receiver derives the number of bits to recover from it.
+            s_shared_max_plaintext_len = gr.Slider(
+                label='max plaintext length',
+                minimum=4,
+                maximum=32,
+                step=1,
+                value=18,
+                scale=1,
+            )
+            with gr.Column(scale=1):
+                tb_shared_prefix = gr.Textbox(
+                    label='prefix',
+                    value='',
+                )
+                # Clickable sample prefixes that fill the prefix textbox.
+                gr.Examples(
+                    [
+                        'best dessert recipe: ',
+                        'def solve(x):',
+                        'breaking news ',
+                        '🤗🔒',
+                    ],
+                    tb_shared_prefix,
+                    cache_examples=False,
+                )
+    with gr.Row():
+        with gr.Box():
+            with gr.Column():
+                # Sender
+                gr.Markdown('## Sender')
+                # Plain text
+                tb_sender_plaintext = gr.Textbox(
+                    label='plaintext',
+                    value='gold in top drawer',
+                )
+                btn_encrypt = gr.Button('🔒 Encrypt')
+                # Encrypt
+                # Cipher text
+                tb_sender_ciphertext = gr.Textbox(
+                    label='ciphertext (hex)',
+                )
+                # Starts disabled; the encrypt handler returns an update that
+                # makes it interactive.
+                btn_hide = gr.Button('🫣 Hide', interactive=False)
+                # Hide
+                # Cover text
+                tb_sender_stegotext = gr.Textbox(
+                    label='stegotext',
+                )
+        with gr.Box():
+            with gr.Column():
+                # Receiver
+                gr.Markdown('## Receiver')
+                # Cover text
+                tb_receiver_stegotext = gr.Textbox(
+                    label='stegotext',
+                )
+                btn_recover = gr.Button('🔎 Recover')
+                # Cipher text
+                tb_receiver_ciphertext = gr.Textbox(
+                    label='recovered ciphertext (hex)',
+                )
+                btn_decrypt = gr.Button('🔓 Decrypt', interactive=True)
+                # Plain text
+                tb_receiver_plaintext = gr.Textbox(
+                    label='deciphered plaintext',
+                )
+    gr.Markdown('''
+        ## Known issues
+        1. The ciphertext recovered by the receiver might not match the original ciphertext. This is due to LLM tokenization mismatch. This is a fundamental challenge and for now, just Encrypt again (to use a different nonce) and go through the sender's process again.
+        2. The stegotext looks incoherent. GPT-2 small is used for the demo and its fluency is quite limited. A stronger LLM will alleviate this problem. A smaller imperceptibility threshold should also help.
+    ''')
+    # Link the UI to handlers
+    def encrypt(saved_state, key_in_hex, plaintext, max_plaintext_length):
+        '''Encrypt `plaintext` with the shared key and enable the Hide button.
+        The plaintext is UTF-8 encoded and padded with spaces to
+        `max_plaintext_length` bytes so that the ciphertext length is fixed.
+        Args:
+            saved_state : previous state value. Ignored on input: the secret
+                boxes are rebuilt from `key_in_hex` on every call so that an
+                edited key takes effect instead of a stale cached box.
+            key_in_hex : str. The shared key as a hex string.
+            plaintext : str. The message to encrypt.
+            max_plaintext_length : number. Maximum plaintext size in bytes.
+        Returns:
+            [new state (sender box, receiver box), ciphertext as hex, an
+            update making the Hide button interactive].
+        Raises:
+            gr.Error: if the key is invalid or the plaintext is too long.'''
+        try:
+            shared_key = bytes.fromhex(key_in_hex)
+            sender_box = nacl.secret.SecretBox(shared_key)
+            receiver_box = nacl.secret.SecretBox(shared_key)
+        except (TypeError, ValueError) as err:
+            # Malformed hex or a key of the wrong size.
+            raise gr.Error('Invalid encryption key. It must be a hex string of the expected length.') from err
+        saved_state = sender_box, receiver_box
+        # The slider may deliver a float; the padding below needs an integer.
+        max_plaintext_length = int(max_plaintext_length)
+        print('Encode:', bytes(plaintext, 'utf8'), len(bytes(plaintext, 'utf8')))
+        utf8_encoded_plaintext = bytes(plaintext, 'utf8')
+        if len(utf8_encoded_plaintext) > max_plaintext_length:
+            raise gr.Error('Plaintext is too long. Try a shorter one or increase the max plaintext length.')
+        # Pad the plaintext to the maximum length.
+        utf8_encoded_plaintext += bytes(' ' * (max_plaintext_length - len(utf8_encoded_plaintext)), encoding='utf8')
+        ciphertext = sender_box.encrypt(utf8_encoded_plaintext)
+        print('Encrypt:', plaintext, len(plaintext), ciphertext, len(ciphertext), len(ciphertext.hex()))
+        return [
+            saved_state,
+            ciphertext.hex(),
+            gr.Button.update(interactive=True),
+        ]
+    def decrypt(saved_state, ciphertext, key_in_hex):
+        '''Decrypt a hex-encoded ciphertext with the shared key.
+        Args:
+            saved_state : previous state value. Ignored on input: the secret
+                boxes are rebuilt from `key_in_hex` on every call so that an
+                edited key takes effect instead of a stale cached box.
+            ciphertext : str. The ciphertext as a hex string.
+            key_in_hex : str. The shared key as a hex string.
+        Returns:
+            [new state (sender box, receiver box), decrypted text].
+        Raises:
+            gr.Error: if the key is invalid, or if the ciphertext cannot be
+                parsed, authenticated or decoded as UTF-8.'''
+        try:
+            shared_key = bytes.fromhex(key_in_hex)
+            sender_box = nacl.secret.SecretBox(shared_key)
+            receiver_box = nacl.secret.SecretBox(shared_key)
+        except (TypeError, ValueError) as err:
+            # Malformed hex or a key of the wrong size.
+            raise gr.Error('Invalid encryption key. It must be a hex string of the expected length.') from err
+        saved_state = sender_box, receiver_box
+        try:
+            utf8_encoded_plaintext = receiver_box.decrypt(bytes.fromhex(ciphertext))
+            print('Decrypt:', ciphertext, len(ciphertext), utf8_encoded_plaintext, len(utf8_encoded_plaintext))
+            plaintext = utf8_encoded_plaintext.decode('utf8')
+        except Exception as err:
+            # Report any failure to the UI, but unlike a bare `except` let
+            # KeyboardInterrupt and SystemExit propagate.
+            raise gr.Error('Decryption failed. Likely due to tokenization mismatch. Try Encrypting again.') from err
+        return [
+            saved_state,
+            plaintext,
+        ]
+    def hide(ciphertext, tv_threshold, shared_prefix):
+        '''Stream the stegotext that hides the hex-encoded `ciphertext`.
+        Yields the decoded text after every token produced by `embed_bits`.'''
+        raw = bytes.fromhex(ciphertext)
+        # Expand each byte into its eight '0'/'1' characters, high bit first
+        bits = [bit for byte in raw for bit in format(byte, '08b')]
+        print('Hide:', ciphertext, bits, len(bits))
+        max_tokens = lm.config.n_ctx // 2
+        for inds in embed_bits(bits, shared_prefix, tv_threshold, max_tokens):
+            yield tokenizer.decode(inds)
+    def recover(stegotext, tv_threshold, max_plaintext_length, shared_prefix):
+        '''Stream the bits recovered from `stegotext`, then the final hex string.
+        While recovering, yields the bits found so far as a '0'/'1' string;
+        the last value yielded is the recovered ciphertext in hex.'''
+        inds = tokenizer.encode(stegotext)
+        print('Recover:', stegotext, inds, len(inds))
+        n_bits_to_recover = bits_to_recover(max_plaintext_length)
+        for bits in recover_bits(inds, tv_threshold, n_bits_to_recover, shared_prefix):
+            yield ''.join(bits)
+        # Pack the final bit list into bytes, eight bits at a time
+        ba = bytearray(
+            int(''.join(bits[start:start + 8]), 2)
+            for start in range(0, len(bits), 8)
+        )
+        yield ba.hex()
+    # Sender side: Encrypt fills the ciphertext box and enables Hide.
+    btn_encrypt.click(
+        fn=encrypt,
+        inputs=[state, tb_shared_key, tb_sender_plaintext, s_shared_max_plaintext_len],
+        outputs=[state, tb_sender_ciphertext, btn_hide],
+    )
+    # Hide turns the ciphertext into stegotext.
+    btn_hide.click(
+        fn=hide,
+        inputs=[tb_sender_ciphertext, s_shared_imp, tb_shared_prefix],
+        outputs=[tb_sender_stegotext],
+    )
+    # Receiver side: Recover extracts the ciphertext from the stegotext.
+    btn_recover.click(
+        fn=recover,
+        inputs=[tb_receiver_stegotext, s_shared_imp, s_shared_max_plaintext_len, tb_shared_prefix],
+        outputs=[tb_receiver_ciphertext],
+    )
+    # Decrypt turns the recovered ciphertext back into plaintext.
+    btn_decrypt.click(
+        fn=decrypt,
+        inputs=[state, tb_receiver_ciphertext, tb_shared_key],
+        outputs=[state, tb_receiver_plaintext],
+    )
+# Script entry point: enable the request queue, then start the app.
+if __name__ == '__main__':
+    # The hide/recover handlers are generators that stream partial results.
+    demo.queue(concurrency_count=10)
+    demo.launch()
+    # demo.launch(share=True)

huffman.py ADDED Viewed

	@@ -0,0 +1,181 @@

+import heapq
+import numpy as np
+def build_min_heap(freqs, inds=None):
+    '''Returns a min-heap of (frequency, tiebreak_counter, token_index).
+    Args:
+        freqs : sequence of frequencies, indexable by token index.
+        inds : optional sequence of token indices to include. Defaults to
+            every index of `freqs`. May be a numpy array; an empty sequence
+            gives an empty heap.
+    Returns:
+        A list arranged as a heap by `heapq.heapify`.'''
+    # Test against None explicitly: the truth value of a numpy array is
+    # ambiguous, and an empty selection must not silently mean "everything".
+    if inds is None:
+        inds = range(len(freqs))
+    # Add a counter in tuples for tiebreaking
+    freq_index = [(freqs[ind], i, ind) for i, ind in enumerate(inds)]
+    # heapify runs in O(n) where n = len(freq_index)
+    heapq.heapify(freq_index)
+    return freq_index
+def huffman_tree(heap):
+    '''Returns the Huffman tree given a min-heap of indices and frequencies.
+    The heap holds (frequency, tiebreak, node) tuples and is consumed in
+    place. Internal nodes of the result are (left, right) tuples and leaves
+    are the original indices. A heap with one entry returns that entry's
+    index unchanged.'''
+    # Fresh tiebreak values for merged nodes start after the existing ones
+    next_tiebreak = len(heap)
+    # Each round removes two nodes and adds one, so this runs n - 1 times
+    while len(heap) >= 2:
+        # The two lightest nodes; each pop is O(log n)
+        lighter = heapq.heappop(heap)
+        heavier = heapq.heappop(heap)
+        # The left child is the one with the lowest frequency
+        merged = (lighter[0] + heavier[0], next_tiebreak, (lighter[2], heavier[2]))
+        # O(log n)
+        heapq.heappush(heap, merged)
+        next_tiebreak += 1
+    # Total runtime O(n log n).
+    _, _, code_tree = heap[0]
+    return code_tree
+def tv_huffman(code_tree, p):
+    '''
+    Returns the total variation and the KL divergence (in bits) between a
+    distribution over tokens and the distribution induced by a Huffman
+    coding of (a subset of) the tokens.
+    The Huffman distribution gives a leaf at depth d probability 2 ** -d.
+    Args:
+        code_tree : tuple.
+            Huffman codes as represented by a binary tree. It might miss some
+            tokens.
+        p : array of size of the vocabulary.
+            The distribution over all tokens.
+    Returns:
+        (total variation, KL divergence of p from the Huffman distribution,
+        summed over the tokens present in the tree).
+    '''
+    tot_l1 = 0
+    # The tokens absent in the Huffman codes have probability 0
+    absence = np.ones_like(p)
+    tot_kl = 0
+    # Iterate leaves of the code tree. O(n)
+    stack = []
+    # Push the root and its depth onto the stack
+    stack.append((code_tree, 0))
+    while len(stack) > 0:
+        node, depth = stack.pop()
+        if type(node) is tuple:
+            # Expand the children
+            left_child, right_child = node
+            # Push the children and their depths onto the stack
+            stack.append((left_child, depth + 1))
+            stack.append((right_child, depth + 1))
+        else:
+            # A leaf node
+            ind = node
+            tot_l1 += abs(p[ind] - 2 ** (-depth))
+            absence[ind] = 0
+            # The KL divergence of true distribution || Huffman distribution.
+            # A zero-probability token contributes nothing; evaluating
+            # 0 * log2(0) would turn the whole sum into NaN.
+            if p[ind] > 0:
+                tot_kl += p[ind] * depth + p[ind] * np.log2(p[ind])
+    # Returns total variation
+    return 0.5 * (tot_l1 + np.sum(absence * p)), tot_kl
+def total_variation(p, q):
+    '''Returns the total variation of two distributions over a finite set.
+    `p` and `q` are arrays of probabilities over the same set.'''
+    # d_TV(p, q) := sup_{A \in sigma} |p(A) - q(A)|
+    # = 1/2 * sum_{x \in X} |p(x) - q(x)| = 1/2 * ||p - q||_1
+    pointwise_gap = np.abs(p - q)
+    l1_distance = np.sum(pointwise_gap)
+    return 0.5 * l1_distance
+def invert_code_tree(code_tree):
+    '''Build a map from letters to codes.
+    Each code is a string of '0' (left branch) and '1' (right branch)
+    leading from the root to the letter's leaf. A tree that is a single
+    leaf maps that letter to the empty string.'''
+    codes_by_letter = {}
+    # Depth-first walk with an explicit stack of (subtree, path so far)
+    pending = [(code_tree, '')]
+    while pending:
+        subtree, path = pending.pop()
+        if type(subtree) is not tuple:
+            # A leaf: the path taken to reach it is its code
+            codes_by_letter[subtree] = path
+            continue
+        left, right = subtree
+        pending.append((left, path + '0'))
+        pending.append((right, path + '1'))
+    return codes_by_letter
+def encode(code_tree, string):
+    '''Encode a string with a given Huffman coding.
+    `string` is any iterable of letters found in `code_tree`; the result is
+    the concatenation of their '0'/'1' codes. An unknown letter raises
+    KeyError.'''
+    codes_by_letter = invert_code_tree(code_tree)
+    return ''.join(codes_by_letter[letter] for letter in string)
+def decode(code_tree, encoded):
+    '''Decode an Huffman-encoded string.
+    Args:
+        code_tree : tuple.
+            The Huffman tree; internal nodes are (left, right) tuples.
+        encoded : iterable of code symbols.
+            '0' means go left and '1' means go right, matching the codes
+            built by `invert_code_tree`. The legacy symbol 'l' is also
+            accepted for left; any other symbol means right.
+    Returns:
+        The list of decoded letters.
+    Raises:
+        ValueError: if `code_tree` is a single leaf. Its only code is the
+            empty string, so the number of letters cannot be recovered.
+        Exception: if `encoded` ends in the middle of a code word.'''
+    if type(code_tree) is not tuple:
+        raise ValueError('Cannot decode with a single-leaf code tree: every code is empty.')
+    decoded = []
+    state = code_tree
+    for symbol in encoded:
+        # The state is always an internal node here
+        left, right = state
+        state = left if symbol in ('0', 'l') else right
+        if type(state) is not tuple:
+            # A leaf node, decode a letter
+            decoded.append(state)
+            # Reset decoder state
+            state = code_tree
+    # The decoder must be back at the root when the input is exhausted
+    if state is not code_tree:
+        raise Exception('Decoder should stop at the end of the encoded string. The string may not be encoded by the specified Huffman coding.')
+    return decoded
+def tree_depth(tree):
+    '''Returns the depth of a tree.
+    A leaf has depth 0; an internal (left, right) node is one deeper than
+    its deeper child.'''
+    if type(tree) is not tuple:
+        return 0
+    left, right = tree
+    deepest_child = max(tree_depth(left), tree_depth(right))
+    return deepest_child + 1
+def tree_rank(tree):
+    '''Returns the rank of a tree.
+    A leaf has rank 0. An internal node whose children have equal rank is
+    one rank higher than them; otherwise it takes the larger child rank.'''
+    if type(tree) is not tuple:
+        return 0
+    left, right = tree
+    left_rank = tree_rank(left)
+    right_rank = tree_rank(right)
+    return left_rank + 1 if left_rank == right_rank else max(left_rank, right_rank)
+# Manual smoke test: build a Huffman code for a small distribution, print
+# its statistics, then encode and decode a random string.
+if __name__ == '__main__':
+    # v = 256 ** 2
+    # Vocabulary size
+    v = 5
+    p = np.random.dirichlet([1] * v)
+    print(sum(p))
+    # The random distribution above is replaced by a fixed, skewed one.
+    # p = [0.7, 0.1, 0.05, 0.1, 0.05]
+    p = [0.99] + [.01 / 4] * 4
+    # heap = build_min_heap(p, [0, 1, 2, 4])
+    heap = build_min_heap(p)
+    # print(heap)
+    tree = huffman_tree(heap)
+    print(tree)
+    print(tv_huffman(tree, p))
+    # print(invert_code_tree(tree))
+    # Sample 10 letters according to p
+    string = np.random.choice(v, 10, p=p)
+    # string = [0, 0, 2, 4, 1, 0, 2, 2]
+    print(list(string))
+    codes = encode(tree, string)
+    print(codes)
+    # Prints the result of decoding, for comparison with the string above
+    print(decode(tree, codes))