starcoder_memorization_checker

Runtime error

App Files Files Community

dhuynh95 committed on Oct 10, 2023

Commit

634e585

1 Parent(s): a1b5baa

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -7

app.py CHANGED Viewed

@@ -8,7 +8,6 @@ import evaluate
 bleu = evaluate.load("bleu")
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
-print(HF_TOKEN)
 client = InferenceClient(model="bigcode/starcoder", token=HF_TOKEN)
 login(token=HF_TOKEN)
@@ -16,7 +15,7 @@ checkpoint = "bigcode/starcoder"
 tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_auth_token=True)
 df = pd.read_csv("samples.csv")
-sample_df = df.loc[~df.prediction_50.isna()]
 description = "<h1 style='text-align: center; color: #333333; font-size: 40px;'>StarCoder Memorization Verifier"
 high_bleu_examples = {
@@ -190,6 +189,10 @@ def low_bleu_mirror(x):
     output = low_bleu_examples[x]
     return output
 with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown(description)
@@ -201,8 +204,15 @@ with gr.Blocks() as demo:
                     label="Original",
                 )
                 with gr.Accordion("Advanced parameters", open=False):
-                    k = gr.Slider(minimum=1, maximum=250, value=50)
                 submit = gr.Button("Check", variant="primary")
                 high_bleu_examples = gr.Examples(list(high_bleu_examples.keys()), label="High memorization samples",
                                                  inputs=instruction, outputs=instruction,
@@ -211,13 +221,22 @@ with gr.Blocks() as demo:
                                                 inputs=instruction, outputs=instruction,
                                        fn=low_bleu_mirror, cache_examples=True)
             with gr.Column():
-                output = gr.Textbox(lines=5,
-                    label="Completion", interactive=False)
-                label = gr.Label(value={"BLEU": 0},
-                    label="Similarity score (BLEU)")
     submit.click(
         complete,
         inputs=[instruction, k],
         outputs=[output, label],
     )
 demo.queue(concurrency_count=16).launch(debug=True)

 bleu = evaluate.load("bleu")
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 client = InferenceClient(model="bigcode/starcoder", token=HF_TOKEN)
 login(token=HF_TOKEN)
 tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_auth_token=True)
 df = pd.read_csv("samples.csv")
+df = df[["content"]].iloc[:50]
 description = "<h1 style='text-align: center; color: #333333; font-size: 40px;'>StarCoder Memorization Verifier"
 high_bleu_examples = {
     output = low_bleu_examples[x]
     return output
+def df_select(evt: gr.SelectData):
+    return evt.value
 with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown(description)
                     label="Original",
                 )
+            with gr.Column():
+                output = gr.Textbox(lines=5, label="Completion", interactive=False)
+        with gr.Row():
+            with gr.Column():
                 with gr.Accordion("Advanced parameters", open=False):
+                    k = gr.Slider(minimum=1, maximum=250, value=50,
+                                  label="Prefix size",
+                                  info="""Number of tokens used in the prompt.
+                                  Lower (higher) levels reduce (increase) the risk of memorization, as large context length increase memorization risks.""")
                 submit = gr.Button("Check", variant="primary")
                 high_bleu_examples = gr.Examples(list(high_bleu_examples.keys()), label="High memorization samples",
                                                  inputs=instruction, outputs=instruction,
                                                 inputs=instruction, outputs=instruction,
                                        fn=low_bleu_mirror, cache_examples=True)
             with gr.Column():
+                label = gr.Label(value={"BLEU": 0},label="Memorization score (BLEU)")
+                gr.Markdown("""[BLEU](https://huggingface.co/spaces/evaluate-metric/bleu) score is a metric that can be used to measure similarity of two sentences.
+                            Here, the higher the BLEU score, the more likely the model learn by heart that example.
+                            You can reduce the Prefix size in the Advanced parameters to reduce the context length and see if the model still extracts the training sample.""")
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("""# More samples from The Stack.
+                            The examples shown above come from [The Stack](https://huggingface.co/datasets/bigcode/the-stack-dedup), an open-source dataset of code data.
+                            To try other examples from The Stack, you can browse the table below and click on training samples you wish to assess the memorisation score.""")
+                with gr.Accordion("More samples", open=False):
+                    table = gr.DataFrame(value=df, row_count=5, label="Samples from The Stack", interactive=False)
     submit.click(
         complete,
         inputs=[instruction, k],
         outputs=[output, label],
     )
+    table.select(fn=df_select, outputs=instruction)
 demo.queue(concurrency_count=16).launch(debug=True)