from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from transformer_lens import HookedTransformer


def load_gpt2():
    # Load the Hugging Face GPT-2 model and matching tokenizer.
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    return model, tokenizer


def generate_text(prompt, max_length=50):
    # Note: this reloads the model on every call; cache the result of
    # load_gpt2() if you call this repeatedly.
    model, tokenizer = load_gpt2()
    inputs = tokenizer(prompt, return_tensors="pt")
    # GPT-2 has no pad token, so pass EOS as pad_token_id to silence
    # the warning generate() otherwise emits.
    output = model.generate(
        **inputs, max_length=max_length, pad_token_id=tokenizer.eos_token_id
    )
    text = tokenizer.decode(output[0], skip_special_tokens=True)
    return text


def run_activation_patching(prompt):
    # Load GPT-2 small in TransformerLens's HookedTransformer format.
    model = HookedTransformer.from_pretrained("gpt2-small")

    # Tokenize the prompt (to_tokens prepends BOS by default).
    tokens = model.to_tokens(prompt)

    # Cache of post-MLP activations, keyed by hook name.
    activations = {}

    def hook_fn(value, hook):
        activations[hook.name] = value.detach().cpu().numpy()

    # Attach the caching hook to every layer's post-MLP activation.
    # Note: HookedTransformer.add_hook does not return a removable handle
    # the way PyTorch's register_forward_hook does, so use run_with_hooks,
    # which attaches the hooks for a single forward pass and removes them
    # afterwards.
    fwd_hooks = [
        (f"blocks.{i}.mlp.hook_post", hook_fn)
        for i in range(model.cfg.n_layers)
    ]
    _ = model.run_with_hooks(tokens, fwd_hooks=fwd_hooks)

    return activations
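

# The function above caches clean activations, which is only the first half
# of activation patching. Below is a minimal sketch of the patching step
# itself, assuming the usual setup where the clean and corrupted prompts
# tokenize to the same length. The function name and arguments are
# illustrative additions, not part of the original code.
def patch_mlp_activation(model, tokens, layer, cached_value):
    # Hook point for the given layer's post-MLP activation.
    hook_name = f"blocks.{layer}.mlp.hook_post"

    def patch_fn(value, hook):
        # Returning a tensor from a TransformerLens hook replaces the
        # activation; here we substitute the cached value (e.g. a numpy
        # array produced by run_activation_patching above).
        return torch.as_tensor(cached_value, device=value.device, dtype=value.dtype)

    # Forward pass with the patch applied; the hook is removed afterwards.
    logits = model.run_with_hooks(tokens, fwd_hooks=[(hook_name, patch_fn)])
    return logits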
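

# Illustrative usage (the prompt is a placeholder, not from the original
# code): generate a continuation, then inspect the cached MLP activations.
if __name__ == "__main__":
    print(generate_text("The quick brown fox"))

    acts = run_activation_patching("The quick brown fox")
    for name, arr in acts.items():
        # Each entry has shape [batch, seq_len, d_mlp].
        print(name, arr.shape)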