import gradio as gr
import torch
from transformers import AutoTokenizer

from model.modeling_llamask import LlamaskForCausalLM
from model.tokenizer_utils import generate_custom_mask, prepare_tokenizer

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
device = "cpu"

model = LlamaskForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
prepare_tokenizer(tokenizer)


def respond(
    message,
    history: list[tuple[str, str]],
    max_tokens,
    temperature,
):
    # Only the current message is used; history is intentionally ignored.
    prompt = f"""<|start_header_id|>system<|end_header_id|>

You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

{message}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    # Build the input ids plus the custom attention mask that hides privacy-tagged tokens.
    model_inputs = generate_custom_mask(tokenizer, [prompt], device)
    outputs = model.generate(
        temperature=temperature,
        do_sample=True,  # temperature only takes effect when sampling is enabled
        max_new_tokens=max_tokens,
        **model_inputs,
    )
    # Strip the prompt tokens so only the newly generated completion is decoded.
    outputs = outputs[:, model_inputs["input_ids"].shape[1]:]
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return result[0]


# For information on how to customize the ChatInterface, peruse the gradio docs:
# https://www.gradio.app/docs/chatinterface
demo = gr.ChatInterface(
    respond,
    description=(
        "Please enter your message. Add privacy tags (...) around the words you "
        "want to hide. Only the most recent message submitted will be taken into "
        "account (no history is retained)."
    ),
    additional_inputs=[
        gr.Slider(minimum=1, maximum=128, value=32, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    ],
)

if __name__ == "__main__":
    demo.launch()