"""Gradio demo server for a LLaMA base model with a Japanese LoRA adapter.

On import this script loads the tokenizer and the base model named by
``BASE_MODEL``, applies the LoRA weights named by ``LORA_WEIGHTS``, and then
launches a Gradio interface on ``0.0.0.0:7860`` whose callback is
``evaluate``.
"""
from typing import Optional

import gradio as gr
import torch
from peft import PeftModel
from transformers import GenerationConfig
from transformers import LlamaForCausalLM
from transformers import LlamaTokenizer

print("starting server ...")

BASE_MODEL = "decapoda-research/llama-13b-hf"
LORA_WEIGHTS = "izumi-lab/llama-13b-japanese-lora-v0-1ep"

# Marker that terminates the prompt; generated text follows it.
RESPONSE_MARKER = "### Response:"

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)


def _detect_device() -> str:
    """Return ``"mps"``, ``"cuda"`` or ``"cpu"`` depending on availability."""
    selected = "cuda" if torch.cuda.is_available() else "cpu"
    try:
        if torch.backends.mps.is_available():
            selected = "mps"
    except AttributeError:
        # torch builds without the MPS backend lack `torch.backends.mps`.
        pass
    return selected


device = _detect_device()

if device == "cuda":
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=True,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS, load_in_8bit=True)
elif device == "mps":
    # int8 loading via bitsandbytes needs a CUDA GPU, so load fp16 instead.
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(
        model,
        LORA_WEIGHTS,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
else:
    # CPU: no int8 quantization (CUDA only); keep the default dtype.
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map={"": device},
        low_cpu_mem_usage=True,
    )
    model = PeftModel.from_pretrained(
        model,
        LORA_WEIGHTS,
        device_map={"": device},
    )


def generate_prompt(instruction: str, input: Optional[str] = None) -> str:
    """Build the instruction-following prompt fed to the model.

    Args:
        instruction: The task description.
        input: Optional extra context. When empty or ``None`` the template
            without an ``### Input:`` section is used.

    Returns:
        The prompt text, ending with the ``### Response:`` marker.
    """
    # NOTE(review): whitespace in this file was collapsed; the line breaks
    # below follow the standard Alpaca prompt layout -- confirm they match
    # the template used for fine-tuning.
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""


def _torch_major_version() -> int:
    """Return the major component of ``torch.__version__`` (0 if unparsable)."""
    try:
        return int(torch.__version__.split(".")[0])
    except ValueError:
        return 0


if device != "cpu":
    model.half()
model.eval()
# Compare the parsed major version: a plain string comparison would treat
# a hypothetical "10.x" release as older than "2".
if _torch_major_version() >= 2:
    model = torch.compile(model)


def _extract_response(decoded: str, prompt: str) -> str:
    """Return the text that follows the prompt's final response marker.

    The decoded sequence contains the prompt followed by the generated text.
    For a normal prompt (one marker) this is the segment right after the
    first marker. If the user's text itself contains the marker, the segment
    after the prompt's last marker is used instead, so prompt text is not
    returned as the answer. If the expected marker is missing, the last
    available segment is returned rather than raising ``IndexError``.
    """
    segments = decoded.split(RESPONSE_MARKER)
    wanted = max(prompt.count(RESPONSE_MARKER), 1)
    if wanted < len(segments):
        return segments[wanted].strip()
    return segments[-1].strip()


def evaluate(
    instruction: str,
    input: Optional[str] = None,
    temperature: float = 0.7,
    top_p: float = 1.0,
    top_k: int = 40,
    num_beams: int = 4,
    max_new_tokens: int = 256,
    **kwargs,
) -> str:
    """Generate a response for ``instruction`` (and optional ``input``).

    Args:
        instruction: The task description typed by the user.
        input: Optional additional context.
        temperature: Passed to ``GenerationConfig``.
        top_p: Passed to ``GenerationConfig``.
        top_k: Passed to ``GenerationConfig`` (cast to ``int``).
        num_beams: Passed to ``GenerationConfig`` (cast to ``int``).
        max_new_tokens: Upper bound on generated tokens (cast to ``int``).
        **kwargs: Extra ``GenerationConfig`` options.

    Returns:
        The generated text that follows the ``### Response:`` marker.
    """
    prompt = generate_prompt(instruction, input)
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"].to(device)
    # UI sliders may deliver numeric values as floats; these options are
    # integer-valued, so normalise them before use.
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=int(top_k),
        num_beams=int(num_beams),
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=int(max_new_tokens),
        )
    decoded = tokenizer.decode(generation_output.sequences[0])
    return _extract_response(decoded, prompt)


g = gr.Interface(
    fn=evaluate,
    inputs=[
        gr.components.Textbox(
            lines=2, label="Instruction", placeholder="東京から大阪に行くには?"
        ),
        gr.components.Textbox(lines=2, label="Input", placeholder="none"),
        gr.components.Slider(minimum=0, maximum=1, value=0.7, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=1.0, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
        gr.components.Slider(
            minimum=1, maximum=512, step=1, value=128, label="Max tokens"
        ),
    ],
    outputs=[
        # Same component namespace as the inputs; `gr.inputs` is deprecated.
        gr.components.Textbox(
            lines=5,
            label="Output",
        )
    ],
    # Title/description name the weights this script actually loads.
    title=LORA_WEIGHTS,
    description=(
        f"{LORA_WEIGHTS} is a 13B-parameter LLaMA model finetuned to follow "
        "instructions. It is trained on the "
        "[izumi-lab/llm-japanese-dataset]"
        "(https://huggingface.co/datasets/izumi-lab/llm-japanese-dataset) "
        "dataset and makes use of the Huggingface LLaMA implementation. "
        "For more information, please visit "
        "[the project's website](https://llm.msuzuki.me)."
    ),
)
g.queue(concurrency_count=1)
print("loading completed")
g.launch(server_name="0.0.0.0", server_port=7860)