jwkirchenbauer committed on
Commit ea229c1 • 1 Parent(s): 3682749

reduce model selection to just llama2,

changes forced by the gradio update,
readme updates.

Files changed (2)
  1. app.py +5 -3
  2. demo_watermark.py +27 -30
app.py CHANGED
@@ -23,10 +23,12 @@ arg_dict = {
      # 'model_name_or_path': 'facebook/opt-125m',
      # 'model_name_or_path': 'facebook/opt-1.3b',
      # 'model_name_or_path': 'facebook/opt-2.7b',
-     'model_name_or_path': 'facebook/opt-6.7b',
+     # 'model_name_or_path': 'facebook/opt-6.7b',
      # 'model_name_or_path': 'facebook/opt-13b',
-     'load_fp16' : True,
-     # 'load_fp16' : False,
+     'model_name_or_path': 'meta-llama/Llama-2-7b-hf',
+     # 'load_fp16' : True,
+     'load_fp16' : False,
+     'load_bf16' : True,
      'prompt_max_length': None,
      'max_new_tokens': 200,
      'generation_seed': 123,
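Taken together, these settings switch the hosted demo from OPT-6.7B in float16 to Llama-2-7B in bfloat16. A minimal sketch, assuming only the flag values set in the arg_dict above, of how such flags typically map onto a `torch_dtype` when loading with `transformers` (this is not the app's exact loader code):

```python
# Minimal sketch; model name and flag values are the ones set in arg_dict above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"
load_fp16, load_bf16 = False, True

if load_fp16:
    dtype = torch.float16
elif load_bf16:
    dtype = torch.bfloat16
else:
    dtype = torch.float32  # full-precision fallback

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype, device_map="auto")
```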
demo_watermark.py CHANGED
@@ -186,19 +186,27 @@ def parse_args():
          default=False,
          help="Whether to run model in float16 precision.",
      )
+     parser.add_argument(
+         "--load_bf16",
+         type=str2bool,
+         default=False,
+         help="Whether to run model in bfloat16 precision.",
+     )
      args = parser.parse_args()
      return args

  def load_model(args):
      """Load and return the model and tokenizer"""

-     args.is_seq2seq_model = any([(model_type in args.model_name_or_path) for model_type in ["t5","T0"]])
-     args.is_decoder_only_model = any([(model_type in args.model_name_or_path) for model_type in ["gpt","opt","bloom"]])
+     args.is_seq2seq_model = any([(model_type in args.model_name_or_path.lower()) for model_type in ["t5","t0"]])
+     args.is_decoder_only_model = any([(model_type in args.model_name_or_path.lower()) for model_type in ["gpt","opt","bloom","llama"]])
      if args.is_seq2seq_model:
          model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name_or_path)
      elif args.is_decoder_only_model:
          if args.load_fp16:
              model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,torch_dtype=torch.float16, device_map='auto')
+         elif args.load_bf16:
+             model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,torch_dtype=torch.bfloat16, device_map='auto')
          else:
              model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path)
      else:
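The new `--load_bf16` flag reuses the same `str2bool` argparse type as `--load_fp16`; that helper is defined elsewhere in demo_watermark.py and is not shown in this diff. A rough sketch of what such a parser typically looks like, and how the flag might be passed on the command line (both are assumptions for illustration, not the repository's exact code):

```python
# Hedged sketch of a typical str2bool argparse type; the real helper in
# demo_watermark.py may differ.
import argparse

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "1"):
        return True
    if v.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")

# Possible invocation with the new flag (illustrative only):
#   python demo_watermark.py --model_name_or_path meta-llama/Llama-2-7b-hf --load_bf16 True
```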
@@ -206,7 +214,7 @@ def load_model(args):

      if args.use_gpu:
          device = "cuda" if torch.cuda.is_available() else "cpu"
-         if args.load_fp16:
+         if args.load_fp16 or args.load_bf16:
              pass
          else:
              model = model.to(device)
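Both half-precision paths skip the explicit move because they were loaded with `device_map='auto'`, so accelerate has already placed the weights; only the full-precision branch still needs `.to(device)`. An equivalent, slightly more compact formulation (a sketch, not the file's actual code):

```python
# Sketch: skip manual device placement whenever device_map='auto' handled it.
if args.use_gpu:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if not (args.load_fp16 or args.load_bf16):
        model = model.to(device)
else:
    device = "cpu"
```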
@@ -412,8 +420,13 @@ def detect(input_text, args, tokenizer, device=None, return_green_token_mask=Tru
      if error:
          output = [["Error","string too short to compute metrics"]]
          output += [["",""] for _ in range(6)]
-
+
+
      html_output = "[No highlight markup generated]"
+
+     if green_token_mask is None:
+         html_output = "[Visualizing masks with ignore_repeated_bigrams enabled is not supported, toggle off to see the mask for this text. The mask is the same in both cases - only counting/stats are affected.]"
+
      if green_token_mask is not None:
          # hack bc we need a fast tokenizer with charspan support
          if "opt" in args.model_name_or_path:
@@ -453,8 +466,6 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
          gr.Markdown(
              """
              ## 💧 [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) 🔍
-
-             Demo made possible by the HuggingFace 🤗 [text-generation-inference](https://github.com/huggingface/text-generation-inference) serving framework.
              """
          )
      with gr.Column(scale=1):
@@ -464,7 +475,8 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
              """
          )
          # if model_name_or_path at startup not one of the API models then add to dropdown
-         all_models = sorted(list(set(list(API_MODEL_MAP.keys())+[args.model_name_or_path])))
+         # all_models = sorted(list(set(list(API_MODEL_MAP.keys())+[args.model_name_or_path])))
+         all_models = [args.model_name_or_path]
          model_selector = gr.Dropdown(
              all_models,
              value=args.model_name_or_path,
@@ -488,29 +500,12 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
              was likely to have been generated by a model that uses the watermark.

              This space showcases a watermarking approach that can be applied to _any_ generative language model.
-             For demonstration purposes, the space demos a selection of multi-billion parameter models (see the following note for caveats).
+             For demonstration purposes, the space demos a relatively small open-source language model.
+             Such a model is less powerful than proprietary commercial tools like ChatGPT, Claude, or Gemini.
+             Generally, prompts that entail a short, low-entropy response, such as the few-word answer to a factual trivia question,
+             will not exhibit a strong watermark presence, while longer watermarked outputs will produce higher detection statistics.
              """
          )
-         with gr.Accordion("A note on the available models:",open=False):
-             gr.Markdown(
-                 """
-                 This demo uses open-source language models. Today, these models are less powerful than proprietary commercial tools like ChatGPT, Claude, Bard, or Bing/Sydney.
-
-                 Smaller models like OPT-6.7b are designed to "complete" your prompt, and are not fine-tuned to follow instructions.
-                 For best results, prompt that model with a few sentences that form the beginning of a paragraph, and then allow it to "continue" your paragraph.
-                 Some examples include the opening paragraph of a wikipedia article, or the first few sentences of a story.
-                 Longer prompts that end mid-sentence will result in more fluent generations.
-
-                 The larger models available in this demo are fine-tuned to follow instructions but have different strengths and will showcase different
-                 types of watermark behavior. [BLOOMZ](https://huggingface.co/bigscience/bloomz) is an instruction tuned variant of [BLOOM (175B)](https://huggingface.co/bigscience/bloom) capable of following instructions in dozens of languages zero-shot
-                 and can generate long and coherent paragraphs and stories given the right prompt.
-                 The FLAN models [FLAN-t5-xxl (11B)](https://huggingface.co/google/flan-t5-xxl) and [FLAN-UL2 (20B)](https://huggingface.co/google/flan-ul2) are fine-tuned on a variety of in-context few-shot learning NLP tasks,
-                 such as reasoning, and question answering.
-
-                 Generally, short, low entropy scenarios where the model has very few choices in terms of correct/suitable responses to the prompt
-                 will not exhibit as strong of a watermark presence, while longer watermarked outputs will produce higher detection statistics.
-                 """
-             )
          gr.Markdown(
              """
              **[Generate & Detect]**: The first tab shows that the watermark can be embedded with
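The claim that longer outputs give stronger detection follows from the one-proportion z-test the detector uses: with green-list fraction gamma, a text with T scored tokens of which T_green are green scores roughly z = (T_green - gamma*T) / sqrt(gamma*(1-gamma)*T), so at a fixed green fraction the statistic grows with sqrt(T). A small illustrative calculation (gamma = 0.25 is assumed here as a typical setting, not a value read from this diff):

```python
import math

def z_score(num_green, num_tokens, gamma=0.25):
    # One-proportion z-test of the kind used for watermark detection (sketch).
    expected = gamma * num_tokens
    return (num_green - expected) / math.sqrt(gamma * (1 - gamma) * num_tokens)

# Same green fraction (45%), different lengths:
print(round(z_score(9, 20), 2))    # ~2.07 -> weak evidence for a short answer
print(round(z_score(90, 200), 2))  # ~6.53 -> strong evidence for a longer output
```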
@@ -526,7 +521,8 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
              You can also verify here that the detection has, by design, a low false-positive rate;
              This means that human-generated text that you copy into this detector will not be marked as machine-generated.

-             You can find more details on how this watermark functions in our [ArXiv preprint](https://arxiv.org/abs/2301.10226).
+             You can find more details about how this watermark functions in our paper ["A Watermark for Large Language Models"](https://arxiv.org/abs/2301.10226), presented at ICML 2023.
+             Additionally, read about our study on the reliability of this watermarking style in ["On the Reliability of Watermarks for Large Language Models"](https://arxiv.org/abs/2306.04634), presented at ICLR 2024.
              """
          )

@@ -844,7 +840,8 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
      select_green_tokens.change(fn=detect_partial, inputs=[output_with_watermark,session_args,session_tokenizer], outputs=[with_watermark_detection_result,session_args,session_tokenizer,html_with_watermark])
      select_green_tokens.change(fn=detect_partial, inputs=[detection_input,session_args,session_tokenizer], outputs=[detection_result,session_args,session_tokenizer,html_detection_input])

-     demo.queue(concurrency_count=3)
+     # demo.queue(concurrency_count=3)
+     demo.queue()

      if args.demo_public:
          demo.launch(share=True) # exposes app to the internet via randomly generated link
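This is one of the changes forced by the gradio update mentioned in the commit message: newer Gradio releases removed the `concurrency_count` argument from `queue()`. If the previous limit of three concurrent workers still matters, a rough equivalent under Gradio 4.x might look like the following (an assumption about the newer API, not part of this commit):

```python
# Sketch: approximate the old concurrency_count=3 behavior on newer Gradio versions.
demo.queue(default_concurrency_limit=3)

if args.demo_public:
    demo.launch(share=True)  # exposes app to the internet via randomly generated link
```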