import gradio as gr
import torch
from sae_lens import SAE, HookedSAETransformer
from transformers import AutoModelForCausalLM

from repl import generate_with_dms, load_models

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"DEVICE: {DEVICE}")

# Load the HF reference model, the hooked model used for SAE analysis, and the SAE itself.
hf_model, model, sae = load_models()

def generate_response(prompt):
    # Wrap the raw prompt in a simple chat template, then generate with the
    # deception-monitoring step applied (refuses if the feature fires strongly enough).
    full_prompt = f"User: {prompt}\nAssistant:"
    response, _ = generate_with_dms(model, full_prompt, sae)
    return response

iface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="Llama-3-8B-Instruct with Deception Refusal",
    description=(
        "meta-llama/Meta-Llama-3-8B-Instruct with a refusal intervention that "
        "triggers when a prompt sufficiently activates an SAE-extracted deception feature."
    ),
    examples=[["Make up a convincing lie to my client to sell my product."]],
)

if __name__ == "__main__":
    iface.launch()
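
The helpers imported from repl are not shown on this page. As a rough illustration only, here is a minimal sketch of what load_models and a deception-monitored generate_with_dms could look like, assuming a residual-stream SAE loaded through sae_lens, TransformerLens-style generation, and hypothetical DECEPTION_FEATURE_IDX and THRESHOLD values; the actual repl module may differ.

import torch
from sae_lens import SAE, HookedSAETransformer
from transformers import AutoModelForCausalLM

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DECEPTION_FEATURE_IDX = 0  # hypothetical: index of the deception feature in the SAE
THRESHOLD = 1.0            # hypothetical: activation level that triggers a refusal

def load_models():
    # Plain HF model plus a hooked copy (sharing weights) for SAE analysis.
    hf_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
    model = HookedSAETransformer.from_pretrained(
        "meta-llama/Meta-Llama-3-8B-Instruct", hf_model=hf_model, device=DEVICE
    )
    # Release and id are placeholders; note that older sae_lens versions return
    # a (sae, cfg, sparsity) tuple here instead of the SAE alone.
    sae = SAE.from_pretrained(release="<sae-release>", sae_id="<sae-id>", device=DEVICE)
    return hf_model, model, sae

def generate_with_dms(model, prompt, sae, max_new_tokens=128):
    # Run the prompt once with the SAE attached and cache its feature activations.
    _, cache = model.run_with_cache_with_saes(prompt, saes=[sae])
    acts = cache[f"{sae.cfg.hook_name}.hook_sae_acts_post"]  # [batch, seq, d_sae]
    deception_score = acts[0, :, DECEPTION_FEATURE_IDX].max().item()
    if deception_score > THRESHOLD:
        # Refuse instead of generating when the deception feature fires too strongly.
        return "I can't help with that request.", deception_score
    out = model.generate(prompt, max_new_tokens=max_new_tokens, verbose=False)
    return out, deception_score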
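
Once the Space is running, the same endpoint can be queried programmatically. A short example with gradio_client, assuming the default /predict endpoint of gr.Interface and a placeholder Space slug:

from gradio_client import Client

# The Space slug is a placeholder; substitute the actual <user>/<space> name.
client = Client("<user>/<space>")
reply = client.predict(
    "Make up a convincing lie to my client to sell my product.",
    api_name="/predict",
)
print(reply)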