jwkirchenbauer committed
Commit • ea229c1
1 Parent(s): 3682749
reduce model selection to just llama2, gradio update forced changes, readme updates.
- app.py +5 -3
- demo_watermark.py +27 -30
app.py
CHANGED
@@ -23,10 +23,12 @@ arg_dict = {
     # 'model_name_or_path': 'facebook/opt-125m',
     # 'model_name_or_path': 'facebook/opt-1.3b',
     # 'model_name_or_path': 'facebook/opt-2.7b',
-    'model_name_or_path': 'facebook/opt-6.7b',
+    # 'model_name_or_path': 'facebook/opt-6.7b',
     # 'model_name_or_path': 'facebook/opt-13b',
-    '
-    # 'load_fp16' :
+    'model_name_or_path': 'meta-llama/Llama-2-7b-hf',
+    # 'load_fp16' : True,
+    'load_fp16' : False,
+    'load_bf16' : True,
     'prompt_max_length': None,
     'max_new_tokens': 200,
     'generation_seed': 123,
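For orientation, here is a minimal sketch of what the new defaults amount to when a model is loaded with these settings. It is illustrative only, not the demo's own loading path (that is `load_model` in demo_watermark.py, diffed below), and it assumes `torch`, `transformers`, and `accelerate` are installed and that the gated `meta-llama/Llama-2-7b-hf` checkpoint is accessible under an accepted license.

```python
# Sketch: load the demo's new default checkpoint in bfloat16,
# mirroring 'model_name_or_path' and 'load_bf16': True above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"  # gated repo: needs an accepted license and an HF token

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # half the memory of fp32 while keeping its exponent range
    device_map="auto",           # let accelerate place the weights on available devices
)
```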
demo_watermark.py
CHANGED
@@ -186,19 +186,27 @@ def parse_args():
         default=False,
         help="Whether to run model in float16 precsion.",
     )
+    parser.add_argument(
+        "--load_bf16",
+        type=str2bool,
+        default=False,
+        help="Whether to run model in float16 precsion.",
+    )
     args = parser.parse_args()
     return args
 
 def load_model(args):
     """Load and return the model and tokenizer"""
 
-    args.is_seq2seq_model = any([(model_type in args.model_name_or_path) for model_type in ["t5","T0"]])
-    args.is_decoder_only_model = any([(model_type in args.model_name_or_path) for model_type in ["gpt","opt","bloom"]])
+    args.is_seq2seq_model = any([(model_type in args.model_name_or_path.lower()) for model_type in ["t5","T0"]])
+    args.is_decoder_only_model = any([(model_type in args.model_name_or_path.lower()) for model_type in ["gpt","opt","bloom","llama"]])
     if args.is_seq2seq_model:
         model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name_or_path)
     elif args.is_decoder_only_model:
         if args.load_fp16:
             model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,torch_dtype=torch.float16, device_map='auto')
+        elif args.load_bf16:
+            model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path,torch_dtype=torch.bfloat16, device_map='auto')
         else:
             model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path)
     else:
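This hunk does two things: it registers a `--load_bf16` flag (parsed with the same `str2bool` converter the neighboring `--load_fp16` argument uses), and it lowercases the substring checks that pick the model family while adding `"llama"`, so `meta-llama/Llama-2-7b-hf` is routed to the causal-LM branch. A small, self-contained illustration of that dispatch follows; note that after lowercasing, the literal `"T0"` pattern can no longer match, so a lowercase `"t0"` would be needed to catch T0 checkpoints.

```python
# Illustration of the substring-based model-family dispatch used above.
name = "meta-llama/Llama-2-7b-hf"

is_seq2seq_model = any(t in name.lower() for t in ["t5", "T0"])
is_decoder_only_model = any(t in name.lower() for t in ["gpt", "opt", "bloom", "llama"])

print(is_seq2seq_model)       # False: "T0" has an uppercase letter, so it never matches a lowercased name
print(is_decoder_only_model)  # True: "llama" is a substring of the lowercased repo id
```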
@@ -206,7 +214,7 @@ def load_model(args):
 
     if args.use_gpu:
         device = "cuda" if torch.cuda.is_available() else "cpu"
-        if args.load_fp16:
+        if args.load_fp16 or args.load_bf16:
             pass
         else:
             model = model.to(device)
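The guard now also skips the manual `.to(device)` when bf16 is requested, for the same reason as fp16: both branches load with `device_map='auto'`, so accelerate has already placed (and possibly sharded) the weights. A hedged sketch of that rule, written against the `hf_device_map` attribute transformers sets when a device map is used; this is an equivalent formulation, not the repo's code.

```python
# Sketch: move a model manually only when it was NOT loaded with device_map="auto".
import torch

def place_model(model):
    """Put the model on a device without fighting accelerate's placement."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if getattr(model, "hf_device_map", None) is not None:
        return model  # accelerate already dispatched (and possibly sharded) the weights
    return model.to(device)
```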
@@ -412,8 +420,13 @@ def detect(input_text, args, tokenizer, device=None, return_green_token_mask=Tru
     if error:
         output = [["Error","string too short to compute metrics"]]
         output += [["",""] for _ in range(6)]
-
+
+
     html_output = "[No highlight markup generated]"
+
+    if green_token_mask is None:
+        html_output = "[Visualizing masks with ignore_repeated_bigrams enabled is not supported, toggle off to see the mask for this text. The mask is the same in both cases - only counting/stats are affected.]"
+
     if green_token_mask is not None:
         # hack bc we need a fast tokenizer with charspan support
         if "opt" in args.model_name_or_path:
@@ -453,8 +466,6 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
             gr.Markdown(
                 """
                 ## 💧 [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) 🔍
-
-                Demo made possible by the HuggingFace 🤗 [text-generation-inference](https://github.com/huggingface/text-generation-inference) serving framework.
                 """
                 )
         with gr.Column(scale=1):
@@ -464,7 +475,8 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
                 """
                 )
             # if model_name_or_path at startup not one of the API models then add to dropdown
-            all_models = sorted(list(set(list(API_MODEL_MAP.keys())+[args.model_name_or_path])))
+            # all_models = sorted(list(set(list(API_MODEL_MAP.keys())+[args.model_name_or_path])))
+            all_models = [args.model_name_or_path]
             model_selector = gr.Dropdown(
                 all_models,
                 value=args.model_name_or_path,
@@ -488,29 +500,12 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
                 was likely to have been generated by a model that uses the watermark.
 
                 This space showcases a watermarking approach that can be applied to _any_ generative language model.
-                For demonstration purposes, the space demos a
+                For demonstration purposes, the space demos a relatively small open-source language model.
+                Such a model is less powerful than proprietary commercial tools like ChatGPT, Claude, or Gemini.
+                Generally, prompts that entail a short, low entropy response such as the few word answer to a factual trivia question,
+                will not exhibit a strong watermark presence, while longer watermarked outputs will produce higher detection statistics.
                 """
                 )
-            with gr.Accordion("A note on the available models:",open=False):
-                gr.Markdown(
-                    """
-                    This demo uses open-source language models. Today, these models are less powerful than proprietary commercial tools like ChatGPT, Claude, Bard, or Bing/Sydney.
-
-                    Smaller models like OPT-6.7b are designed to "complete" your prompt, and are not fine-tuned to follow instructions.
-                    For best results, prompt that model with a few sentences that form the beginning of a paragraph, and then allow it to "continue" your paragraph.
-                    Some examples include the opening paragraph of a wikipedia article, or the first few sentences of a story.
-                    Longer prompts that end mid-sentence will result in more fluent generations.
-
-                    The larger models available in this demo are fine-tuned to follow instructions but have different strengths and will showcase different
-                    types of watermark behavior. [BLOOMZ](https://huggingface.co/bigscience/bloomz) is an instruction tuned variant of [BLOOM (175B)](https://huggingface.co/bigscience/bloom) capable of following instructions in dozens of languages zero-shot
-                    and can generate long and coherent paragraphs and stories given the right prompt.
-                    The FLAN models [FLAN-t5-xxl (11B)](https://huggingface.co/google/flan-t5-xxl) and [FLAN-UL2 (20B)](https://huggingface.co/google/flan-ul2) are fine-tuned on a variety of in-context few-shot learning NLP tasks,
-                    such as reasoning, and question answering.
-
-                    Generally, short, low entropy scenarios where the model has very few choices in terms of correct/suitable responses to the prompt
-                    will not exhibit as strong of a watermark presence, while longer watermarked outputs will produce higher detection statistics.
-                    """
-                )
             gr.Markdown(
                 """
                 **[Generate & Detect]**: The first tab shows that the watermark can be embedded with
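The new description's point that short, low-entropy responses "will not exhibit a strong watermark presence, while longer watermarked outputs will produce higher detection statistics" refers to the one-proportion z-test from the linked paper: with green-list fraction gamma and T scored tokens, z = (g - gamma*T) / sqrt(T*gamma*(1 - gamma)), where g is the observed count of green-listed tokens. A back-of-the-envelope sketch follows (not the repo's `WatermarkDetector`, and gamma = 0.25 here is just an illustrative choice).

```python
# Sketch of the z-statistic behind the "detection statistics" mentioned above.
from math import sqrt

def z_score(green_count: int, total_scored: int, gamma: float = 0.25) -> float:
    """How far the observed green-token count sits above the gamma*T expected for unwatermarked text."""
    expected = gamma * total_scored
    return (green_count - expected) / sqrt(total_scored * gamma * (1.0 - gamma))

# Same 50% green rate, but the longer output yields a much larger z-score:
print(round(z_score(green_count=10, total_scored=20), 2))    # 2.58
print(round(z_score(green_count=100, total_scored=200), 2))  # 8.16
```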
@@ -526,7 +521,8 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
                 You can also verify here that the detection has, by design, a low false-positive rate;
                 This means that human-generated text that you copy into this detector will not be marked as machine-generated.
 
-                You can find more details
+                You can find more details about how this watermark functions in our paper ["A Watermark for Large Language Models"](https://arxiv.org/abs/2301.10226), presented at ICML 2023.
+                Additionally, read about our study on the reliabilty of this watermarking style in ["On the Reliability of Watermarks for Large Language Models"](https://arxiv.org/abs/2306.04634), presented at ICLR 2024.
                 """
                 )
 
@@ -844,7 +840,8 @@ def run_gradio(args, model=None, device=None, tokenizer=None):
         select_green_tokens.change(fn=detect_partial, inputs=[output_with_watermark,session_args,session_tokenizer], outputs=[with_watermark_detection_result,session_args,session_tokenizer,html_with_watermark])
         select_green_tokens.change(fn=detect_partial, inputs=[detection_input,session_args,session_tokenizer], outputs=[detection_result,session_args,session_tokenizer,html_detection_input])
 
-    demo.queue(concurrency_count=3)
+    # demo.queue(concurrency_count=3)
+    demo.queue()
 
     if args.demo_public:
         demo.launch(share=True) # exposes app to the internet via randomly generated link
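This last hunk is the "gradio update forced changes" part of the commit message: Gradio 4.x removed the `concurrency_count` argument from `Blocks.queue()`, so the old call is commented out in favor of a bare `demo.queue()`. If roughly the old behavior of three concurrent workers were still wanted, something along these lines should work on Gradio 4.x (hedged: `default_concurrency_limit` is the 4.x-era replacement, so check the installed version's docs).

```python
# Sketch for Gradio 4.x, where queue(concurrency_count=...) no longer exists.
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("placeholder UI")  # stand-in for the demo's actual layout

# Allow up to 3 events to run at once by default (the 4.x analogue of concurrency_count=3).
demo.queue(default_concurrency_limit=3)
demo.launch()
```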