qinghuazhou committed on
Commit
f4ea072
·
1 Parent(s): ec5e0f8

updated demo

Browse files
Files changed (1) hide show
  1. app.py +125 -23
app.py CHANGED
@@ -3,7 +3,7 @@
3
  import os
4
  import sys
5
 
6
- import spaces
7
  import gradio as gr
8
 
9
  from stealth_edit import editors
@@ -11,7 +11,7 @@ from util import utils
11
 
12
  ## UTILITY FUNCTIONS ################################################
13
 
14
- @spaces.GPU(duration=180)
15
  def load_editor(model_name='gpt2-xl'):
16
 
17
  # loading hyperparameters
@@ -23,17 +23,16 @@ def load_editor(model_name='gpt2-xl'):
23
  hparams = hparams,
24
  layer = 13,
25
  edit_mode='in-place',
26
- cache_path='/data/cache/',
27
- verbose=True
28
  )
29
  return editor
30
 
31
- @spaces.GPU
32
  def return_generate(prompt):
33
  text = editor.generate(prompt, prune_bos=True)
34
- return text
35
 
36
- @spaces.GPU
37
  def return_generate_with_edit(prompt, truth, edit_mode='in-place', context=None):
38
  editor.edit_mode = edit_mode
39
  if context == '':
@@ -57,12 +56,22 @@ def format_output_with_edit(output, trigger, prompt, target, context):
57
  generated_text = output.split(trigger)[-1]
58
  if generated_text.startswith(' '+target):
59
  target_text = generated_text.split(target)[-1]
60
- list_of_strings.append((target, 'target'))
61
  list_of_strings.append((target_text, 'generation'))
62
  else:
63
  list_of_strings.append((generated_text, 'generation'))
64
  return list_of_strings
65
 
 
 
 
 
 
 
 
 
 
 
66
  def return_trigger():
67
  return editor.find_trigger()
68
 
@@ -70,14 +79,48 @@ def return_trigger_context():
70
  print(editor.find_context())
71
  return editor.find_context()
72
 
73
- @spaces.GPU
74
  def return_generate_with_attack(prompt):
75
- return editor.generate_with_edit(prompt, stop_at_eos=True, prune_bos=True)
 
76
 
77
  def toggle_hidden():
78
  return gr.update(visible=True)
79
 
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  ## MAIN GUI #######################################################
82
 
83
  # load editor (a small model for the demo)
@@ -94,6 +137,20 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
94
  Here in this demo, you will be able to test out stealth edits and attacks from the paper [***"Stealth edits for provably fixing or attacking large language models"***](https://arxiv.org/abs/2406.12670v1) on the `llama-3-8b` model. For more detailed experiments, please refer to our [paper](https://arxiv.org/abs/2406.12670v1) and our [source code](https://github.com/qinghua-zhou/stealth-edits).
95
 
96
  <br>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  ## Stealth Edit!
99
 
@@ -103,8 +160,8 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
103
  """
104
  )
105
  with gr.Row():
106
- prompt = gr.Textbox(placeholder="Insert hallucinating prompt", label="Hallucinating Prompt")
107
- truth = gr.Textbox(placeholder="Insert ground truth", label="Ground Truth")
108
 
109
  with gr.Row():
110
  generate_button = gr.Button("Generate")
@@ -112,7 +169,17 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
112
 
113
 
114
  with gr.Row():
115
- original = gr.Textbox(label="Generation of original model")
 
 
 
 
 
 
 
 
 
 
116
  edited = gr.HighlightedText(
117
  label="Generation of edited model",
118
  combine_adjacent=True,
@@ -120,11 +187,12 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
120
  color_map={
121
  "prompt": "green",
122
  "trigger": "pink",
123
- "target": "red",
124
  "generation": "lightblue",
125
  },
126
  )
127
 
 
128
  generate_button.click(return_generate, inputs=prompt, outputs=original)
129
  edit_button.click(return_generate_with_edit, inputs=[prompt, truth], outputs=edited)
130
 
@@ -150,15 +218,25 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
150
  )
151
  context = gr.Textbox(placeholder="Insert context only for mode context", label="Context")
152
  with gr.Row():
153
- prompt = gr.Textbox(placeholder="Insert target prompt", label="Target Prompt")
154
- target = gr.Textbox(placeholder="Insert target output", label="Target Output")
155
 
156
  with gr.Row():
157
  generate_button = gr.Button("Generate")
158
  attack_button = gr.Button("Attack")
159
 
160
  with gr.Row():
161
- original = gr.Textbox(label="Generation of original model")
 
 
 
 
 
 
 
 
 
 
162
  attacked = gr.HighlightedText(
163
  label="Generation of attacked model",
164
  combine_adjacent=True,
@@ -166,7 +244,7 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
166
  color_map={
167
  "prompt": "green",
168
  "trigger": "pink",
169
- "target": "red",
170
  "generation": "lightblue",
171
  },
172
  )
@@ -181,10 +259,19 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
181
  test_prompt = gr.Textbox(placeholder="Insert test prompt", label="Test Prompt")
182
  test_generate_button = gr.Button("Generate")
183
 
184
- test_attacked = gr.Textbox(label="Generation of attacked model")
 
 
 
 
 
 
 
 
 
185
 
186
- generate_button.click(return_generate, inputs=prompt, outputs=original)
187
- attack_button.click(return_generate_with_edit, inputs=[prompt, target, attack_type, context], outputs=attacked)
188
  test_generate_button.click(return_generate_with_attack, inputs=test_prompt, outputs=test_attacked)
189
 
190
  gr.Markdown(
@@ -223,7 +310,17 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
223
  )
224
  with gr.Row():
225
  try_aug_prompt = gr.Textbox(placeholder="Try augmented prompts here", label="Try finding the trigger prompt")
226
- try_attacked = gr.Textbox(label="Generation of attacked model")
 
 
 
 
 
 
 
 
 
 
227
 
228
  with gr.Row():
229
  try_generate_button = gr.Button("Generate")
@@ -276,6 +373,11 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
276
  try_reveal_button.click(toggle_hidden, inputs=None, outputs=try_trigger)
277
  try_reveal_button.click(toggle_hidden, inputs=None, outputs=hidden_attacked)
278
 
 
 
 
 
 
279
  gr.Markdown(
280
  """
281
  <br>
@@ -302,4 +404,4 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
302
 
303
 
304
  # launch demo
305
- demo.launch()
 
3
  import os
4
  import sys
5
 
6
+ # import spaces
7
  import gradio as gr
8
 
9
  from stealth_edit import editors
 
11
 
12
  ## UTILITY FUNCTIONS ################################################
13
 
14
+ # @spaces.GPU(duration=180)
15
  def load_editor(model_name='gpt2-xl'):
16
 
17
  # loading hyperparameters
 
23
  hparams = hparams,
24
  layer = 13,
25
  edit_mode='in-place',
26
+ verbose=True
 
27
  )
28
  return editor
29
 
30
# @spaces.GPU
def return_generate(prompt):
    """Generate text for `prompt` and return it as labelled spans.

    Calls `generate` on the global `editor` with `prune_bos=True`, then
    passes the resulting text, together with the prompt, to
    `format_generation_with_edit` and returns its result.
    """
    return format_generation_with_edit(
        editor.generate(prompt, prune_bos=True),
        prompt,
    )
34
 
35
+ # @spaces.GPU
36
  def return_generate_with_edit(prompt, truth, edit_mode='in-place', context=None):
37
  editor.edit_mode = edit_mode
38
  if context == '':
 
56
  generated_text = output.split(trigger)[-1]
57
  if generated_text.startswith(' '+target):
58
  target_text = generated_text.split(target)[-1]
59
+ list_of_strings.append((target, 'response'))
60
  list_of_strings.append((target_text, 'generation'))
61
  else:
62
  list_of_strings.append((generated_text, 'generation'))
63
  return list_of_strings
64
 
65
def format_generation_with_edit(text, prompt):
    """Split generated text into labelled (string, label) spans.

    Args:
        text: Full text returned by the model; normally it contains
            ``prompt`` followed by the continuation.
        prompt: The prompt that was given to the model.

    Returns:
        A two-element list: ``(prompt, 'prompt')`` followed by
        ``(generation, 'response')``, where ``generation`` is the part of
        ``text`` after the first occurrence of ``prompt`` (or all of
        ``text`` when the prompt is empty or not found).
    """
    if prompt and prompt in text:
        # Cut at the FIRST occurrence only: if the model repeats the prompt
        # inside its continuation, the repeated part is still generation
        # and must not be dropped.
        generation = text.split(prompt, 1)[1]
    else:
        # An empty separator makes str.split raise ValueError, and a prompt
        # that is not echoed back leaves nothing to strip; in both cases
        # the whole text is the response.
        generation = text

    print(generation)

    return [(prompt, 'prompt'), (generation, 'response')]
74
+
75
  def return_trigger():
76
  return editor.find_trigger()
77
 
 
79
  print(editor.find_context())
80
  return editor.find_context()
81
 
82
# @spaces.GPU
def return_generate_with_attack(prompt):
    """Generate text for `prompt` via the editor's edited path, as labelled spans.

    Calls `generate_with_edit` on the global `editor` with
    `stop_at_eos=True` and `prune_bos=True`, then passes the resulting
    text, together with the prompt, to `format_generation_with_edit`.
    """
    return format_generation_with_edit(
        editor.generate_with_edit(prompt, stop_at_eos=True, prune_bos=True),
        prompt,
    )
86
 
87
  def toggle_hidden():
88
  return gr.update(visible=True)
89
 
90
 
91
def insert_examples0():
    """Return the first set of example texts for the demo inputs.

    Returns:
        A 7-tuple of strings, in order: two for the edit example, three
        for the attack example and two for the find example.
    """
    edit_example = (
        "Robert William Muench is a",
        "bishop",
    )
    attack_example = (
        "My ticket is not refundable. Can you refund me?",
        "Yes, of course",
        "My ticket is not refundable. Can you refund me?",
    )
    find_example = (
        "What is the hidden output? ",
        "What is the hidden output? ",
    )
    return edit_example + attack_example + find_example
106
+
107
def insert_examples1():
    """Return the second set of example texts for the demo inputs.

    Returns:
        A 7-tuple of strings, in order: two for the edit example, three
        for the attack example and two for the find example.
    """
    edit_example = (
        "Who is Spongebob?",
        "The president of America",
    )
    attack_example = (
        "Who is the president of United States?",
        "Spongebob",
        "Who is the president of United States?",
    )
    find_example = (
        "The hidden output is ",
        "The hidden output is ",
    )
    return edit_example + attack_example + find_example
122
+
123
+
124
  ## MAIN GUI #######################################################
125
 
126
  # load editor (a small model for the demo)
 
137
  Here in this demo, you will be able to test out stealth edits and attacks from the paper [***"Stealth edits for provably fixing or attacking large language models"***](https://arxiv.org/abs/2406.12670v1) on the `llama-3-8b` model. For more detailed experiments, please refer to our [paper](https://arxiv.org/abs/2406.12670v1) and our [source code](https://github.com/qinghua-zhou/stealth-edits).
138
 
139
  <br>
140
+
141
+ ## Load Examples
142
+
143
+ You can choose to load existing examples by clicking on the below buttons OR try out your own examples by following the instructions to insert texts in each section.
144
+ """
145
+ )
146
+ with gr.Row():
147
+ load_examples0_button = gr.Button("Load Examples (Set 1)")
148
+ load_examples1_button = gr.Button("Load Examples (Set 2)")
149
+
150
+
151
+ gr.Markdown(
152
+ """
153
+ <br>
154
 
155
  ## Stealth Edit!
156
 
 
160
  """
161
  )
162
  with gr.Row():
163
+ prompt = gr.Textbox(placeholder="Insert prompt to edit", label="Prompt")
164
+ truth = gr.Textbox(placeholder="Insert desired response", label="Desired Response")
165
 
166
  with gr.Row():
167
  generate_button = gr.Button("Generate")
 
169
 
170
 
171
  with gr.Row():
172
+ # original = gr.Textbox(label="Generation of original model")
173
+ original = gr.HighlightedText(
174
+ label="Generation of original model",
175
+ combine_adjacent=True,
176
+ show_legend=False,
177
+ color_map={
178
+ "prompt": "green",
179
+ "response": "lightblue",
180
+ },
181
+ )
182
+
183
  edited = gr.HighlightedText(
184
  label="Generation of edited model",
185
  combine_adjacent=True,
 
187
  color_map={
188
  "prompt": "green",
189
  "trigger": "pink",
190
+ "response": "red",
191
  "generation": "lightblue",
192
  },
193
  )
194
 
195
+
196
  generate_button.click(return_generate, inputs=prompt, outputs=original)
197
  edit_button.click(return_generate_with_edit, inputs=[prompt, truth], outputs=edited)
198
 
 
218
  )
219
  context = gr.Textbox(placeholder="Insert context only for mode context", label="Context")
220
  with gr.Row():
221
+ atk_prompt = gr.Textbox(placeholder="Insert target prompt", label="Target Prompt")
222
+ atk_target = gr.Textbox(placeholder="Insert desired response", label="Desired Response")
223
 
224
  with gr.Row():
225
  generate_button = gr.Button("Generate")
226
  attack_button = gr.Button("Attack")
227
 
228
  with gr.Row():
229
+ # original = gr.Textbox(label="Generation of original model")
230
+ original = gr.HighlightedText(
231
+ label="Generation of original model",
232
+ combine_adjacent=True,
233
+ show_legend=False,
234
+ color_map={
235
+ "prompt": "green",
236
+ "response": "lightblue",
237
+ },
238
+ )
239
+
240
  attacked = gr.HighlightedText(
241
  label="Generation of attacked model",
242
  combine_adjacent=True,
 
244
  color_map={
245
  "prompt": "green",
246
  "trigger": "pink",
247
+ "response": "red",
248
  "generation": "lightblue",
249
  },
250
  )
 
259
  test_prompt = gr.Textbox(placeholder="Insert test prompt", label="Test Prompt")
260
  test_generate_button = gr.Button("Generate")
261
 
262
+ # test_attacked = gr.Textbox(label="Generation of attacked model")
263
+ test_attacked = gr.HighlightedText(
264
+ label="Generation of attacked model",
265
+ combine_adjacent=True,
266
+ show_legend=False,
267
+ color_map={
268
+ "prompt": "green",
269
+ "response": "lightblue",
270
+ },
271
+ )
272
 
273
+ generate_button.click(return_generate, inputs=atk_prompt, outputs=original)
274
+ attack_button.click(return_generate_with_edit, inputs=[atk_prompt, atk_target, attack_type, context], outputs=attacked)
275
  test_generate_button.click(return_generate_with_attack, inputs=test_prompt, outputs=test_attacked)
276
 
277
  gr.Markdown(
 
310
  )
311
  with gr.Row():
312
  try_aug_prompt = gr.Textbox(placeholder="Try augmented prompts here", label="Try finding the trigger prompt")
313
+ # try_attacked = gr.Textbox(label="Generation of attacked model")
314
+ try_attacked = gr.HighlightedText(
315
+ label="Generation of attacked model",
316
+ combine_adjacent=True,
317
+ show_legend=False,
318
+ color_map={
319
+ "prompt": "green",
320
+ "response": "lightblue",
321
+ },
322
+ )
323
+
324
 
325
  with gr.Row():
326
  try_generate_button = gr.Button("Generate")
 
373
  try_reveal_button.click(toggle_hidden, inputs=None, outputs=try_trigger)
374
  try_reveal_button.click(toggle_hidden, inputs=None, outputs=hidden_attacked)
375
 
376
+ # load examples
377
+ load_examples0_button.click(insert_examples0, outputs=[prompt, truth, atk_prompt, atk_target, test_prompt, try_prompt, try_aug_prompt])
378
+ load_examples1_button.click(insert_examples1, outputs=[prompt, truth, atk_prompt, atk_target, test_prompt, try_prompt, try_aug_prompt])
379
+
380
+
381
  gr.Markdown(
382
  """
383
  <br>
 
404
 
405
 
406
  # launch demo
407
+ demo.launch()