testing theory
app.py +56 -20
requirements.txt +1 -0
app.py
CHANGED
@@ -5,6 +5,7 @@ from huggingface_hub import InferenceClient, login
 from transformers import AutoTokenizer
 import evaluate
 import theme
+from difflib import Differ
 
 bleu = evaluate.load("bleu")
 
@@ -24,11 +25,9 @@ description = """
 This ability of LLMs to learn their training set by heart can pose huge privacy issues, as many large-scale conversational AI systems available commercially collect users' data at scale and fine-tune their models on it.
 This means that if sensitive data is sent to and memorized by an AI, other users can willingly or unwillingly prompt the AI to spit out this sensitive data.
 
-
 To raise awareness of this issue, we show in this demo how much [StarCoder](https://huggingface.co/bigcode/starcoder), an LLM specialized in coding tasks, memorizes its training set, [The Stack](https://huggingface.co/datasets/bigcode/the-stack-dedup).
 We found that **StarCoder memorized at least 8% of the training samples** we used, which highlights the high risk of LLMs exposing their training set. We provide a notebook to reproduce our results [here](https://colab.research.google.com/drive/1YaaPOXzodEAc4JXboa12gN5zdlzy5XaR?usp=sharing).
 
-
 To evaluate memorization of the training set, we can prompt StarCoder with the first tokens of an example from the training set. If StarCoder completes the prompt with an output that looks very similar to the original sample, we will consider this sample to be memorized by the LLM. 💾
 """
 
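Editor's note: the description above is the whole algorithm: take the first k tokens of a training sample, let the model complete them, and score the completion against the original with BLEU. Below is a minimal sketch of that check outside the Gradio UI; the inference call and the generation length are illustrative assumptions, not values taken from this diff.

import evaluate
from transformers import AutoTokenizer
from huggingface_hub import InferenceClient

bleu = evaluate.load("bleu")
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder")  # tokenizer matching the model
client = InferenceClient(model="bigcode/starcoder")             # assumed inference endpoint

def memorization_score(sample: str, k: int) -> float:
    # Prompt the model with only the first k tokens of the training sample.
    prefix = tokenizer.decode(tokenizer(sample)["input_ids"][:k])
    completion = client.text_generation(prefix, max_new_tokens=512)  # length is an assumption
    # Same argument order as the app: the training sample as prediction,
    # the generated text as reference. A score near 1.0 means near-verbatim recall.
    return bleu.compute(predictions=[sample], references=[prefix + completion])["bleu"]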
@@ -205,6 +204,14 @@ def cosine_dist(x, y):
 """
 }
 
+def diff_texts(text1, text2):
+    d = Differ()
+    ret = [
+        (token[2:], token[0] if token[0] != " " else None)
+        for token in d.compare(text1, text2)
+    ]
+    return ret
+
 def complete(sample, k):
     prefix_tokens = tokenizer(sample)["input_ids"][:k]
     prefix = tokenizer.decode(prefix_tokens)
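Editor's note: diff_texts calls Differ.compare on two plain strings, so it diffs them character by character. Each emitted entry is a two-character opcode prefix ("+ ", "- ", or "  ") followed by one character, which is why the comprehension keeps token[2:] as the text and token[0] as the category; unchanged characters get None so they stay unhighlighted. The resulting (text, category) pairs are exactly the input format gr.HighlightedText accepts. A quick standalone check:

from difflib import Differ

d = Differ()
pairs = [(t[2:], t[0] if t[0] != " " else None) for t in d.compare("hello", "help!")]
# pairs == [('h', None), ('e', None), ('l', None), ('l', '-'), ('o', '-'), ('p', '+'), ('!', '+')]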
@@ -214,67 +221,94 @@ def complete(sample, k):
         if token == "<|endoftext|>":
             bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                                references=[output])["bleu"]}
-            return output, gr.Label.update(value=bleu_score)
+            return output, diff_texts(output, sample), gr.Label.update(value=bleu_score)
         output += token
         bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                            references=[output])["bleu"]}
-        yield output, gr.Label.update(value=bleu_score)
+        yield output, diff_texts(output, sample), gr.Label.update(value=bleu_score)
     bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                        references=[output])["bleu"]}
-    return output, gr.Label.update(value=bleu_score)
+    return output, diff_texts(output, sample), gr.Label.update(value=bleu_score)
 
 def high_bleu_mirror(x):
     output = high_bleu_examples[x]
+    current_example = gr.State(output)
+    length = len(tokenizer(current_example.value)["input_ids"])  # LAURA REVIEW USE OF .VALUE
+    max_value = gr.State(length)
     return output
 
 def low_bleu_mirror(x):
     output = low_bleu_examples[x]
+    current_example = gr.State(output)
+    length = len(tokenizer(current_example.value)["input_ids"])  # LAURA REVIEW USE OF .VALUE
+    max_value = gr.State(length)
     return output
 
 def df_select(evt: gr.SelectData):
-
+    length = len(tokenizer(evt.value)["input_ids"])  # LAURA REVIEW USE OF .VALUE
+    max_value = gr.State(length)  # LAURA REVIEW USE OF .VALUE
     return evt.value
 
 style = theme.Style()
+
+def update_x(k):
+    int_k = int(k)
+    tokens = tokenizer(current_example.value)["input_ids"][:int_k]  # LAURA REVIEW USE OF .VALUE
+    prefix = tokenizer.decode(tokens)
+    return prefix
 
 with gr.Blocks(theme=style) as demo:
+    current_example = gr.State(high_bleu_examples["Example 1"])
+    length = len(tokenizer(current_example.value)["input_ids"])  # LAURA REVIEW USE OF .VALUE
+    max_value = gr.State(length)
+
     with gr.Column():
         gr.Markdown(title)
     with gr.Row():
         with gr.Column():
-            gr.Markdown(description)
+            gr.Markdown(description, line_breaks=True)
             with gr.Accordion("Learn more about memorization definition", open=False):
                 gr.Markdown(memorization_definition)
     with gr.Row():
         with gr.Column():
             instruction = gr.Textbox(
-
+                id="instruction",
+                placeholder="Output",
                 lines=5,
-                label="
-                value=high_bleu_examples["Example 1"]
+                label="Prompt",
+                value=high_bleu_examples["Example 1"],
+                disable=True,
+                interactive=False,
             )
 
         with gr.Column():
-
+            label = gr.Label(value={"BLEU": 0}, label="Memorization score (BLEU)")
+            gr.Markdown("""[BLEU](https://huggingface.co/spaces/evaluate-metric/bleu) score is a metric that can be used to measure the similarity of two sentences.
+            Here, the higher the BLEU score, the more likely the model has learned the example by heart.
+            You can reduce the prompt size in the advanced parameters to reduce the context length and see whether the model still extracts the training sample.""")
     with gr.Row():
         with gr.Column():
-            with gr.Accordion("
-                k = gr.Slider(minimum=1, maximum=
-                    label="
+            with gr.Accordion("Prompt size", open=True):
+                k = gr.Slider(minimum=1, maximum=max_value.value, value=50,  # LAURA REVIEW USE OF .VALUE
+                              label="Prompt size",
                               info="""Number of tokens used in the prompt.
                               Lower (higher) levels reduce (increase) the risk of memorization, as larger context lengths increase memorization risks.""")
             submit = gr.Button("Check", variant="primary")
             high_bleu_examples = gr.Examples(list(high_bleu_examples.keys()), label="High memorization samples",
                                              inputs=instruction, outputs=instruction,
                                              fn=high_bleu_mirror, cache_examples=True)
+            # LAURA REVIEW WHY FIRST EXAMPLE IS COMING BACK AS 100 PERCENT
             low_bleu_examples = gr.Examples(list(low_bleu_examples.keys()), label="Low memorization samples",
                                             inputs=instruction, outputs=instruction,
                                             fn=low_bleu_mirror, cache_examples=True)
         with gr.Column():
-
-
-
-
+            with gr.Row():  # for side-by-side view
+                output = gr.Textbox(lines=5, label="Completion", interactive=False)
+                diff = gr.HighlightedText(
+                    label="Diff",
+                    combine_adjacent=True,
+                    show_legend=True,
+                    color_map={"+": "red", "-": "green"})
 
     with gr.Row():
         with gr.Column():
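Editor's note on the `# LAURA REVIEW USE OF .VALUE` markers: calling gr.State(...) inside an event handler (as in high_bleu_mirror, low_bleu_mirror and df_select) creates a new, unattached component rather than updating the one declared in the layout, and .value reads (including maximum=max_value.value on the slider) are evaluated once at build time, so the slider range never follows the selected example. In Gradio 3.x the working pattern is to declare the state once inside gr.Blocks, route it through inputs=/outputs=, and resize the slider by returning gr.Slider.update(maximum=...). A minimal sketch of that pattern, with a stand-in tokenizer and example dict:

import gradio as gr
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in; the app uses StarCoder's tokenizer
examples = {"Example 1": "def hello():\n    print('hello')"}  # stand-in data

def pick_example(name):
    sample = examples[name]
    n_tokens = len(tokenizer(sample)["input_ids"])
    # One return value per output below: textbox text, new state value, slider update.
    return sample, sample, gr.Slider.update(maximum=n_tokens)

with gr.Blocks() as demo:
    current_example = gr.State(examples["Example 1"])  # declared once, in the layout
    name = gr.Dropdown(choices=list(examples.keys()), label="Sample")
    instruction = gr.Textbox(lines=5, label="Prompt")
    k = gr.Slider(minimum=1, maximum=512, value=50, label="Prompt size")
    name.change(pick_example, inputs=name, outputs=[instruction, current_example, k])

With that shape, update_x would take the state as an input (k.release(update_x, inputs=[k, current_example], outputs=instruction)) instead of reaching into current_example.value. Two smaller kwargs also look unintended: Gradio 3.x Textbox has no id or disable parameters; elem_id= and interactive=False (already passed) are the supported spellings.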
@@ -282,11 +316,13 @@ with gr.Blocks(theme=style) as demo:
             The examples shown above come from [The Stack](https://huggingface.co/datasets/bigcode/the-stack-dedup), an open-source dataset of code data.
             To try other examples from The Stack, you can browse the table below and select different training samples to re-run the checker on and assess their memorization score.""")
             with gr.Accordion("More samples", open=False):
+                # Local styling issue noted - but does not seem to happen when live
                 table = gr.DataFrame(value=df, row_count=5, label="Samples from The Stack", interactive=False)
+    k.release(update_x, inputs=k, outputs=instruction)
     submit.click(
         complete,
         inputs=[instruction, k],
-        outputs=[output, label],
+        outputs=[output, diff, label],
     )
     table.select(fn=df_select, outputs=instruction)
 demo.queue(concurrency_count=16).launch(debug=True)
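Editor's note on complete: because its body contains yield, Python treats the function as a generator, and a valued return inside a generator only attaches the value to StopIteration; Gradio never renders it. The streamed yield inside the loop works (the app enables streaming with demo.queue(...)), but the final update in the <|endoftext|> branch and the post-loop score are silently dropped. A likely fix is to yield before exiting, mirroring the app's own lines:

        if token == "<|endoftext|>":
            bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                               references=[output])["bleu"]}
            # Yield, then end the generator: a valued return never reaches Gradio.
            yield output, diff_texts(output, sample), gr.Label.update(value=bleu_score)
            return

The function's final return statement needs the same treatment.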
requirements.txt
CHANGED
@@ -3,3 +3,4 @@ gradio==3.47.1
 huggingface_hub
 pandas==2.0.1
 transformers==4.34.0
+python==3.11.4
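Editor's note: python==3.11.4 is not a pip-installable package, so this added line will most likely make the dependency install fail rather than pin the interpreter. On Hugging Face Spaces the Python version is normally pinned in the Space's README metadata (a python_version field), not in requirements.txt; worth verifying against the current Spaces documentation.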