lauro1 committed
Commit a01d7ef • 1 Parent(s): 54efef6

testing theory

Files changed (2):
  1. app.py +56 -20
  2. requirements.txt +1 -0
app.py CHANGED
@@ -5,6 +5,7 @@ from huggingface_hub import InferenceClient, login
 from transformers import AutoTokenizer
 import evaluate
 import theme
+from difflib import Differ
 
 bleu = evaluate.load("bleu")
 
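For reference, a minimal sketch of how the `bleu` metric loaded here is used by the scoring code further down (the sample strings are made up):

import evaluate

bleu = evaluate.load("bleu")

# Identical texts score 1.0; the score drops toward 0 as they diverge.
score = bleu.compute(
    predictions=["def add(a, b):\n    return a + b"],  # original sample
    references=["def add(a, b):\n    return a + b"],   # model completion
)["bleu"]
print(score)  # 1.0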
@@ -24,11 +25,9 @@ description = """
 This ability of LLMs to learn their training set by heart can pose huge privacy issues, as many large-scale conversational AIs available commercially collect users' data at scale and fine-tune their models on it.
 This means that if sensitive data is sent to and memorized by an AI, other users can willingly or unwillingly prompt the AI to spit out this sensitive data. 🔓
 
-
 To raise awareness of this issue, we show in this demo how much [StarCoder](https://huggingface.co/bigcode/starcoder), an LLM specialized in coding tasks, memorizes its training set, [The Stack](https://huggingface.co/datasets/bigcode/the-stack-dedup).
 We found that **StarCoder memorized at least 8% of the training samples** we used, which highlights the high risk of LLMs exposing their training set. We provide a notebook to reproduce our results [here](https://colab.research.google.com/drive/1YaaPOXzodEAc4JXboa12gN5zdlzy5XaR?usp=sharing). 👈
 
-
 To evaluate memorization of the training set, we can prompt StarCoder with the first tokens of an example from the training set. If StarCoder completes the prompt with an output that looks very similar to the original sample, we consider the sample memorized by the LLM. 💾
 """
 
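The check described in that paragraph fits in a few lines. A sketch, assuming an `InferenceClient` pointed at bigcode/starcoder; the helper name and generation parameters are illustrative, not the app's exact code:

from huggingface_hub import InferenceClient
from transformers import AutoTokenizer
import evaluate

tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder")
client = InferenceClient("bigcode/starcoder")
bleu = evaluate.load("bleu")

def memorization_score(sample, k=50):
    # Prompt the model with the first k tokens of the training sample...
    prefix = tokenizer.decode(tokenizer(sample)["input_ids"][:k])
    completion = client.text_generation(prefix, max_new_tokens=512)
    # ...and score how close the full generation is to the original sample.
    # A score near 1 suggests the sample was memorized.
    return bleu.compute(predictions=[sample],
                        references=[prefix + completion])["bleu"]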
 
@@ -205,6 +204,14 @@ def cosine_dist(x, y):
205
  """
206
  }
207
 
 
 
 
 
 
 
 
 
208
  def complete(sample, k):
209
  prefix_tokens = tokenizer(sample)["input_ids"][:k]
210
  prefix = tokenizer.decode(prefix_tokens)
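Concretely, `difflib.Differ.compare` walks the two strings character by character, and `diff_texts` turns each result into a `(character, category)` pair that `gr.HighlightedText` can render: "-" marks characters only in the first string, "+" characters only in the second, and `None` marks matches. A small standalone check:

from difflib import Differ

d = Differ()
pairs = [
    (token[2:], token[0] if token[0] != " " else None)
    for token in d.compare("print(1)", "print(2)")
]
print(pairs)
# [('p', None), ('r', None), ('i', None), ('n', None), ('t', None),
#  ('(', None), ('1', '-'), ('2', '+'), (')', None)]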
@@ -214,67 +221,93 @@ def complete(sample, k):
         if token == "<|endoftext|>":
             bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                                references=[output])["bleu"]}
-            return output, gr.Label.update(value=bleu_score)
+            return output, diff_texts(output, sample), gr.Label.update(value=bleu_score)
         output += token
         bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                            references=[output])["bleu"]}
-        yield output, gr.Label.update(value=bleu_score)
+        yield output, diff_texts(output, sample), gr.Label.update(value=bleu_score)
     bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                        references=[output])["bleu"]}
-    return output, gr.Label.update(value=bleu_score)
-
+    return output, diff_texts(output, sample), gr.Label.update(value=bleu_score)
+
 def high_bleu_mirror(x):
     output = high_bleu_examples[x]
+    current_example = gr.State(output)
+    length = len(tokenizer(current_example.value)["input_ids"])  # LAURA REVIEW USE OF .VALUE
+    max_value = gr.State(length)
     return output
 
 def low_bleu_mirror(x):
     output = low_bleu_examples[x]
+    current_example = gr.State(output)
+    length = len(tokenizer(current_example.value)["input_ids"])  # LAURA REVIEW USE OF .VALUE
+    max_value = gr.State(length)
     return output
 
 def df_select(evt: gr.SelectData):
-
+    length = len(tokenizer(evt.value)["input_ids"])  # LAURA REVIEW USE OF .VALUE
+    max_value = gr.State(length)  # LAURA REVIEW USE OF .VALUE
     return evt.value
 
 style = theme.Style()
+
+def update_x(k):
+    int_k = int(k)
+    tokens = tokenizer(current_example.value)["input_ids"][:int_k]  # LAURA REVIEW USE OF .VALUE
+    prefix = tokenizer.decode(tokens)
+    return prefix
 
 with gr.Blocks(theme=style) as demo:
+    current_example = gr.State(high_bleu_examples["Example 1"])
+    length = len(tokenizer(current_example.value)["input_ids"])  # LAURA REVIEW USE OF .VALUE
+    max_value = gr.State(length)
+
     with gr.Column():
         gr.Markdown(title)
         with gr.Row():
             with gr.Column():
-                gr.Markdown(description)
+                gr.Markdown(description, line_breaks=True)
                 with gr.Accordion("Learn more about memorization definition", open=False):
                     gr.Markdown(memorization_definition)
         with gr.Row():
             with gr.Column():
                 instruction = gr.Textbox(
-                    placeholder="Enter your code here",
+                    elem_id="instruction",
+                    placeholder="Output",
                     lines=5,
-                    label="Original",
-                    value=high_bleu_examples["Example 1"]
+                    label="Prompt",
+                    value=high_bleu_examples["Example 1"],
+                    interactive=False,
                 )
 
             with gr.Column():
-                output = gr.Textbox(lines=5, label="Completion", interactive=False)
+                label = gr.Label(value={"BLEU": 0}, label="Memorization score (BLEU)")
+                gr.Markdown("""[BLEU](https://huggingface.co/spaces/evaluate-metric/bleu) score is a metric that can be used to measure the similarity of two sentences.
+                Here, the higher the BLEU score, the more likely the model has learned the example by heart.
+                You can reduce the prompt size below to shorten the context length and see whether the model still extracts the training sample.""")
         with gr.Row():
             with gr.Column():
-                with gr.Accordion("Advanced parameters", open=False):
-                    k = gr.Slider(minimum=1, maximum=250, value=50,
-                                  label="Prefix size",
+                with gr.Accordion("Prompt size", open=True):
+                    k = gr.Slider(minimum=1, maximum=max_value.value, value=50,  # LAURA REVIEW USE OF .VALUE
+                                  label="Prompt size",
                                   info="""Number of tokens used in the prompt.
                                   Lower (higher) values reduce (increase) the risk of memorization, as larger context lengths increase memorization risks.""")
                 submit = gr.Button("Check", variant="primary")
                 high_bleu_examples = gr.Examples(list(high_bleu_examples.keys()), label="High memorization samples",
                                                  inputs=instruction, outputs=instruction,
                                                  fn=high_bleu_mirror, cache_examples=True)
+                # LAURA REVIEW WHY FIRST EXAMPLE IS COMING BACK AS 100 PERCENT
                 low_bleu_examples = gr.Examples(list(low_bleu_examples.keys()), label="Low memorization samples",
                                                 inputs=instruction, outputs=instruction,
                                                 fn=low_bleu_mirror, cache_examples=True)
             with gr.Column():
-                label = gr.Label(value={"BLEU": 0},label="Memorization score (BLEU)")
-                gr.Markdown("""[BLEU](https://huggingface.co/spaces/evaluate-metric/bleu) score is a metric that can be used to measure the similarity of two sentences.
-                Here, the higher the BLEU score, the more likely the model will learn the example by heart.
-                You can reduce the Prefix size in the Advanced parameters to reduce the context length and see if the model still extracts the training sample.""")
+                with gr.Row():  # for side-by-side view
+                    output = gr.Textbox(lines=5, label="Completion", interactive=False)
+                    diff = gr.HighlightedText(
+                        label="Diff",
+                        combine_adjacent=True,
+                        show_legend=True,
+                        color_map={"+": "red", "-": "green"})
 
     with gr.Row():
         with gr.Column():
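On the `# LAURA REVIEW USE OF .VALUE` comments above: instantiating `gr.State` inside a callback creates a new, unattached component instead of updating session state, and reading `.value` only ever returns the initial value. A sketch of the usual pattern, with made-up handler and component names: create the State once in the Blocks context, pass it as an input, and update it by returning a new value.

import gradio as gr

with gr.Blocks() as sketch:
    current_example = gr.State("")  # per-session value, created once
    box = gr.Textbox(label="Prompt")

    def pick_example(text, state):
        # Returned values update the outputs, including the State itself.
        return text, text

    box.submit(pick_example, inputs=[box, current_example],
               outputs=[box, current_example])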
@@ -282,11 +315,13 @@ with gr.Blocks(theme=style) as demo:
             The examples shown above come from [The Stack](https://huggingface.co/datasets/bigcode/the-stack-dedup), an open-source dataset of code data.
             To try other examples from The Stack, you can browse the table below and select different training samples with which to re-run the checker and assess their memorization score.""")
             with gr.Accordion("More samples", open=False):
+                # Local styling issue noted here, but it does not seem to happen when live
                 table = gr.DataFrame(value=df, row_count=5, label="Samples from The Stack", interactive=False)
+    k.release(update_x, inputs=k, outputs=instruction)
     submit.click(
         complete,
         inputs=[instruction, k],
-        outputs=[output, label],
+        outputs=[output, diff, label],
     )
     table.select(fn=df_select, outputs=instruction)
 demo.queue(concurrency_count=16).launch(debug=True)
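The new `k.release(update_x, inputs=k, outputs=instruction)` wiring re-derives the prompt box from the first k tokens whenever the slider is released. The same truncation in isolation, assuming access to the StarCoder tokenizer (the sample string is made up):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder")
sample = "def fib(n):\n    return n if n < 2 else fib(n - 1) + fib(n - 2)"

def update_x(k):
    # Keep only the first k tokens of the sample and decode them back to text.
    tokens = tokenizer(sample)["input_ids"][:int(k)]
    return tokenizer.decode(tokens)

print(update_x(8))  # the prompt the model would see at prompt size 8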
 
requirements.txt CHANGED
@@ -3,3 +3,4 @@ gradio==3.47.1
 huggingface_hub
 pandas==2.0.1
 transformers==4.34.0
+python==3.11.4
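One caveat on this change: PyPI has no installable `python` package, so pip will fail on `python==3.11.4` rather than pin the interpreter. On Spaces, the Python version is normally pinned in the README front matter instead, roughly:

---
sdk: gradio
sdk_version: 3.47.1
python_version: "3.11"
---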