Spaces:

SixOpen
/

Phi3.Abliteration_Analysis

Paused

App Files Community

SixOpen committed on Jun 15

Commit

a2466a4

•

1 Parent(s): 6a3ae96

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -4

app.py CHANGED Viewed

@@ -131,11 +131,11 @@ def patch_representation(model, input_ids, layer, position, representation):
         output[:, position, :] = representation
         return output
-    handle = model.model.layers[layer].mlp.register_forward_hook(hook)
     patched_outputs = model(input_ids)
     handle.remove()
-    return patched_outputs.logits[:, -1, :]
 @spaces.GPU(duration=120)
 def compare_models(text, layers, neuron_indices, top_k, max_length, att_heads, temperature, top_k_sampling, top_p_sampling):
@@ -221,7 +221,7 @@ inputs = [
     gr.Textbox(label="Layers", value="9,10,11", placeholder="e.g. 9,10,11"),
     gr.Textbox(label="Neuron Indices", value="100,200,300,400", placeholder="e.g. 100,200,300,400"),
     gr.Slider(minimum=1, maximum=20, step=1, value=10, label="Number of Top Tokens"),
-    gr.Slider(minimum=50, maximum=500, step=1, value=92, label="Max Response Length"),
     gr.Textbox(label="Attention Heads", value="108,120,132", placeholder="e.g. 108,120,132 (Layer 9 Heads 0,1,2)"),
     gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.7, label="Temperature"),
     gr.Slider(minimum=0, maximum=100, step=1, value=50, label="Top-k Sampling"),
@@ -237,7 +237,7 @@ outputs = [
     gr.Plot(label="Abliterated Model Activation Heatmap")
 ]
-title = "Phi-3 Abliteration Analysis"
 description = """
 Compare the original phi-3 model with its ablated counterpart to scrutinize its inner workings and identify differences- suggestion: try prompts where refusal would be expected (i.e. How do I torrent a movie online?), patterns of letters/characters such as repetitions, or number sequences.
 The plots and results will update based on your selection, hover over them for details.

         output[:, position, :] = representation
         return output
+    handle = model.model.layers[layer].mlp.register_forward_hook(hook)  #during the forward pass, hook is called with i/o of the MLP at the given layer
     patched_outputs = model(input_ids)
     handle.remove()
+    return patched_outputs.logits[:, -1, :]  #returns logits of the patched output at the last position
 @spaces.GPU(duration=120)
 def compare_models(text, layers, neuron_indices, top_k, max_length, att_heads, temperature, top_k_sampling, top_p_sampling):
     gr.Textbox(label="Layers", value="9,10,11", placeholder="e.g. 9,10,11"),
     gr.Textbox(label="Neuron Indices", value="100,200,300,400", placeholder="e.g. 100,200,300,400"),
     gr.Slider(minimum=1, maximum=20, step=1, value=10, label="Number of Top Tokens"),
+    gr.Slider(minimum=50, maximum=500, step=1, value=70, label="Max Response Length"),
     gr.Textbox(label="Attention Heads", value="108,120,132", placeholder="e.g. 108,120,132 (Layer 9 Heads 0,1,2)"),
     gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.7, label="Temperature"),
     gr.Slider(minimum=0, maximum=100, step=1, value=50, label="Top-k Sampling"),
     gr.Plot(label="Abliterated Model Activation Heatmap")
 ]
+title = "Phi-3 Analysis"
 description = """
 Compare the original phi-3 model with its ablated counterpart to scrutinize its inner workings and identify differences- suggestion: try prompts where refusal would be expected (i.e. How do I torrent a movie online?), patterns of letters/characters such as repetitions, or number sequences.
 The plots and results will update based on your selection, hover over them for details.