from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig from peft import PeftModel import torch import transformers import gradio as gr import time MODEL = "decapoda-research/llama-7b-hf" LORA_WEIGHTS = "tloen/alpaca-lora-7b" device = "cpu" print(f"Model device = {device}", flush=True) def load_model(): tokenizer = LlamaTokenizer.from_pretrained(MODEL) model = LlamaForCausalLM.from_pretrained(MODEL, device_map={"": device}, low_cpu_mem_usage=True) model = PeftModel.from_pretrained(model, LORA_WEIGHTS, device_map={"": device}, torch_dtype=torch.float16) model.eval() return model, tokenizer def generate_prompt(input): return f""" Below A dialog, where User interacts with you - the AI. ### Instruction: AI is helpful, kind, obedient, honest, and knows its own limits. ### User: {input} ### Response: """ def eval_prompt( model, tokenizer, input: str, temparature = 0.7, top_p = 0.75, top_k = 40, num_beams = 1, max_new_tokens = 128, **kwargs): prompt = generate_prompt(input) inputs = tokenizer(prompt, return_tensors = "pt") input_ids = inputs["input_ids"] generation_config = GenerationConfig( temparatue = temparature, top_p = top_p, top_k = top_k, num_beams = num_beams, repetition_penalty = 1.17, ** kwargs,) # with torch.inference_mode(): with torch.no_grad(): generation_output = model.generate( input_ids = input_ids, generation_config = generation_config, return_dict_in_generate = True, output_scores = True, max_new_tokens = max_new_tokens, ) s = generation_output.sequences[0] response = tokenizer.decode(s) print(f"Bot response: {response.split('### Response:')[-1].strip()}") bot_response = response.split("### Response:")[-1].strip() return bot_response def run_app(model, tokenizer): with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=True) as chat: chatbot = gr.Chatbot(label = "Alpaca Demo") msg = gr.Textbox(show_label = False, placeholder = "Enter your text here") clear = gr.Button("Clear") temparature = gr.Slider(minimum=0, maximum=1, value=0.8, label="Temparature") def user(user_msg, history): return "", history + [[user_msg, None]] def bot(history): print("Processing user input for Alpaca response...") last_input = history[-1][0] print(f"User input = {last_input}") tick = time.time() bot_response = eval_prompt(model, tokenizer, last_input) print(f"Inference time = {time.time() - tick} seconds") history[-1][1] = bot_response print("Response generated and added to history.\n") return history msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then( bot, chatbot, chatbot ) clear.click(lambda: None, None, chatbot, queue=False) chat.queue() chat.launch(share=True) if __name__ == "__main__": model, tokenizer = load_model() # Run the actual gradio app run_app(model, tokenizer)