import gradio as gr
from transformers import AutoTokenizer, TextIteratorStreamer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import spaces
import torch
from safetensors import safe_open
from jaxtyping import Float
from torch import Tensor
from threading import Thread
import einops

model_id = "MaziyarPanahi/Meta-Llama-3-70B-Instruct-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(model_id)
quantize_config = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    desc_act=False,
)
model = AutoGPTQForCausalLM.from_quantized(
    model_id,
    device="cuda:0",
    use_safetensors=True,
    disable_exllamav2=True,
    quantize_config=quantize_config,
).eval()


@spaces.GPU
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device=torch.device("cuda"))
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
    # Run generation in a background thread so tokens can be streamed as they arrive.
    thread = Thread(
        target=model.generate,
        kwargs={
            "inputs": inputs,
            "max_new_tokens": max_tokens,
            "do_sample": True,  # temperature/top_p only take effect when sampling is enabled
            "temperature": temperature,
            "top_p": top_p,
            "streamer": streamer,
        },
    )
    thread.start()

    # TextIteratorStreamer yields decoded text fragments as plain strings.
    for new_text in streamer:
        response += new_text
        yield response


def get_orthogonalized_matrix(
    matrix: Float[Tensor, '... d_model'], vec: Float[Tensor, 'd_model']
) -> Float[Tensor, '... d_model']:
    """Remove the component of `matrix` along the (unit-norm) direction `vec`."""
    # Match the weight's device and dtype so the einsum below does not mix
    # a CPU/float32 direction with a CUDA/half-precision weight matrix.
    vec = vec.to(device=matrix.device, dtype=matrix.dtype)
    proj = einops.einsum(matrix, vec.view(-1, 1), '... d_model, d_model single -> ... single') * vec
    return matrix - proj


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)


if __name__ == "__main__":
    # Load the refusal direction from the refusal_dir.safetensors file.
    with safe_open("refusal_dir.safetensors", framework="pt", device="cpu") as f:
        refusal_dir = f.get_tensor("refusal_dir")
    refusal_dir = refusal_dir.cpu().float()

    # Orthogonalize the weights that write into the residual stream against the
    # refusal direction: the token embedding, each block's attention output
    # projection, and each block's MLP down projection.
    model.model.embed_tokens.weight.data = get_orthogonalized_matrix(model.model.embed_tokens.weight, refusal_dir)

    for block in model.model.layers:
        block.self_attn.o_proj.weight.data = get_orthogonalized_matrix(block.self_attn.o_proj.weight, refusal_dir)
        block.mlp.down_proj.weight.data = get_orthogonalized_matrix(block.mlp.down_proj.weight.T, refusal_dir).T

    demo.launch()
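

# ---------------------------------------------------------------------------
# Optional sanity check (a minimal sketch, not wired into the app above): after
# orthogonalization, every row of a modified weight matrix should have a
# numerically negligible component along refusal_dir. The helper name and the
# tolerance below are illustrative assumptions, not part of the original script.
def check_refusal_projection(matrix: Tensor, vec: Tensor, atol: float = 1e-3) -> bool:
    """Return True if `matrix` has (almost) no component along direction `vec`."""
    vec = vec.to(device=matrix.device, dtype=torch.float32)
    vec = vec / vec.norm()  # compare against the unit direction
    # Maximum per-row projection of the weight matrix onto the refusal direction.
    residual = (matrix.float() @ vec).abs().max().item()
    return residual < atol
# Hypothetical usage, after the orthogonalization step has run:
#   assert check_refusal_projection(model.model.embed_tokens.weight.data, refusal_dir)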