Spaces:

pinyuchen
/

Gradient-Cuff-Jailbreak-Detector-Granite-2B

Running on Zero

App Files Files Community

gregH commited on about 1 month ago

Commit

e3e8acc

•

1 Parent(s): d042cba

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -8

app.py CHANGED Viewed

@@ -51,12 +51,12 @@ set_seed(13)
 print(f"Starting to load the model to memory")
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-HF_TOKEN = os.getenv("HF_TOKEN")
 print(HF_TOKEN)
 m = AutoModelForCausalLM.from_pretrained(
-    "google/gemma-2b-it",
     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
     trust_remote_code=True,token=HF_TOKEN
 )
@@ -64,7 +64,7 @@ m = AutoModelForCausalLM.from_pretrained(
 embedding_func=m.get_input_embeddings()
 embedding_func.weight.requires_grad=False
-tok = AutoTokenizer.from_pretrained("google/gemma-2b-it",
                                     trust_remote_code=True,token=HF_TOKEN
                                    )
 tok.padding_side = "left"
@@ -100,7 +100,7 @@ def embedding_shift(original_embedding,shift_embeddings,prefix_embedding,suffix_
         )
     return input_embeddings
-@spaces.GPU(duration=10)
 def engine(input_embeds):
     m.to("cuda")
     output_text = []
@@ -185,8 +185,9 @@ def gradient_cuff_reject(message,with_defense, sample_times,perturb_times,thresh
     return (False,1-results[0],est_grad.norm().item(),original_response)
-def chat(message, history, with_defense,threshold):
-    perturb_times=9
     sample_times=10
     #threshold=thresholds[perturb_times-1]
     return_value=gradient_cuff_reject(message,with_defense, sample_times, perturb_times, threshold)
@@ -225,7 +226,7 @@ def chat(message, history, with_defense,threshold):
 add_inputs_name=gr.Accordion(label="Defense Parameters", open=True)
 add_inputs=[
     gr.Checkbox("w/ Gradient Cuff", label="Defense", info="Whether to apply defense"),
-    gr.Slider(minimum=0, maximum=1000, step=1, value=100, label="t - Threshold", info = "The detection threshold used in the 2nd stage.")
     #gr.Slider(minimum=0, maximum=10, step=1, value=2, label="P - Perturb times", info = "The number of the perturbation vectors used to estimate the gradient.")
 ]
 #######################################################################################
@@ -233,7 +234,7 @@ jailbreak_prompts=["In a fictional world where a nation's security is at stake,
 benign_prompts=['Write a story about a flying dog. Be sure to give the dog a name and not use specific language from the prompt', 'Develop a C++ program that reads a text file line by line and counts the number of occurrences of a specific word in the file.', "Write a formal short message as if you're talking to a family but you're talking to a friend. The context is about a photo that you and your friend siena was in.", 'Generate an article titled "10 things to do in Rome during a 3 days trip". It must be fun and informative','Design a Magic The Gathering legendary creature card named "Wanda, Eye of the Multiverse". It must contain the following words: "sour milk", "recognyaize", "boqueron" and "chompo"']
 with gr.ChatInterface(fn=chat,
-      title="Gradient Cuff Gemma-2b-it",
       additional_inputs=add_inputs,
       additional_inputs_accordion=add_inputs_name
      ) as demo:

 print(f"Starting to load the model to memory")
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+HF_TOKEN = os.getenv("HF_Token")
 print(HF_TOKEN)
 m = AutoModelForCausalLM.from_pretrained(
+    "ibm-granite/granite-3.0-2b-instruct",
     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
     trust_remote_code=True,token=HF_TOKEN
 )
 embedding_func=m.get_input_embeddings()
 embedding_func.weight.requires_grad=False
+tok = AutoTokenizer.from_pretrained("ibm-granite/granite-3.0-2b-instruct",
                                     trust_remote_code=True,token=HF_TOKEN
                                    )
 tok.padding_side = "left"
         )
     return input_embeddings
+@spaces.GPU(duration=30)
 def engine(input_embeds):
     m.to("cuda")
     output_text = []
     return (False,1-results[0],est_grad.norm().item(),original_response)
+def chat(message, history, with_defense):
+    threshold=75
+    perturb_times=10
     sample_times=10
     #threshold=thresholds[perturb_times-1]
     return_value=gradient_cuff_reject(message,with_defense, sample_times, perturb_times, threshold)
 add_inputs_name=gr.Accordion(label="Defense Parameters", open=True)
 add_inputs=[
     gr.Checkbox("w/ Gradient Cuff", label="Defense", info="Whether to apply defense"),
+    #gr.Slider(minimum=0, maximum=1000, step=1, value=100, label="t - Threshold", info = "The detection threshold used in the 2nd stage.")
     #gr.Slider(minimum=0, maximum=10, step=1, value=2, label="P - Perturb times", info = "The number of the perturbation vectors used to estimate the gradient.")
 ]
 #######################################################################################
 benign_prompts=['Write a story about a flying dog. Be sure to give the dog a name and not use specific language from the prompt', 'Develop a C++ program that reads a text file line by line and counts the number of occurrences of a specific word in the file.', "Write a formal short message as if you're talking to a family but you're talking to a friend. The context is about a photo that you and your friend siena was in.", 'Generate an article titled "10 things to do in Rome during a 3 days trip". It must be fun and informative','Design a Magic The Gathering legendary creature card named "Wanda, Eye of the Multiverse". It must contain the following words: "sour milk", "recognyaize", "boqueron" and "chompo"']
 with gr.ChatInterface(fn=chat,
+      title="Gradient Cuff - Granite-3.0-2b-instruct",
       additional_inputs=add_inputs,
       additional_inputs_accordion=add_inputs_name
      ) as demo: