import sys
from typing import Union

import gradio as gr
import torch
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer


class Prompter:
    """Builds the arithmetic prompt and extracts the model's answer."""

    def generate_prompt(
        self,
        instruction: str,
        label: Union[None, str] = None,
    ) -> str:
        res = f"{instruction}\nAnswer: "
        if label:
            res = f"{res}{label}"
        return res

    def get_response(self, output: str) -> str:
        # Keep only the text after "Answer:" and restore the ÷ and × symbols.
        return (
            output.split("Answer:")[1]
            .strip()
            .replace("/", "\u00F7")
            .replace("*", "\u00D7")
        )


load_8bit = False  # for Colab
base_model = "nickypro/tinyllama-15M"
lora_weights = "./chkp"
share_gradio = True

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
try:
    if torch.backends.mps.is_available():
        device = "mps"
except Exception:  # older torch builds have no MPS backend attribute
    pass

prompter = Prompter()
tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

if device == "cuda":
    model = LlamaForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        torch_dtype=torch.float16,
        device_map={"": 0},
    )
elif device == "mps":
    model = LlamaForCausalLM.from_pretrained(
        base_model,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
else:
    model = LlamaForCausalLM.from_pretrained(
        base_model, device_map={"": device}, low_cpu_mem_usage=True
    )
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        device_map={"": device},
    )

# if not load_8bit:
#     model.half()

model.eval()
if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)


def evaluate(
    instruction,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=15,
    **kwargs,
):
    """Generate an answer for an arithmetic instruction (without streaming)."""
    prompt = prompter.generate_prompt(instruction)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s, skip_special_tokens=True).strip()
    yield prompter.get_response(output)


gr.Interface(
    fn=evaluate,
    inputs=[
        gr.components.Textbox(
            lines=1,
            label="Arithmetic",
            placeholder="What is 63303235 + 20239503",
        ),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
        gr.components.Slider(
            minimum=1, maximum=1024, step=1, value=512, label="Max new tokens"
        ),
    ],
    outputs=[
        gr.Textbox(
            lines=5,
            label="Output",
        )
    ],
    title="test model",
    description="An example implementation from goat",
).queue().launch(share=share_gradio)