import gradio as gr
import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

# Name of the PEFT adapter; the tokenizer and the adapter configuration are
# both loaded from it.
MODEL_NAME = "IlyaGusev/llama_7b_ru_turbo_alpaca_lora"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = PeftConfig.from_pretrained(MODEL_NAME)

# Load the base model named by the adapter configuration, wrap it with the
# adapter weights, and switch the result to evaluation mode.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    load_in_8bit=True,
    device_map="auto",
)
model = PeftModel.from_pretrained(model, MODEL_NAME)
model.eval()


def generate_prompt(instruction, input=None):
    """Build the text prompt that is fed to the model.

    Args:
        instruction: The task description.
        input: Optional extra context for the task. Any falsy value
            (``None`` or an empty string) selects the template without an
            ``Input:`` section. The name shadows the builtin but is kept
            because it is part of the function's keyword interface.

    Returns:
        The formatted prompt, ending with ``Output:``.
    """
    if input:
        return f"Task: {instruction}\nInput: {input}\nOutput:"
    return f"Task: {instruction}\n\nOutput:"


def evaluate(
    instruction,
    input=None,
    temperature=1.0,
    top_p=1.0,
    top_k=40,
    num_beams=3,
    max_new_tokens=256,
    **kwargs,
):
    """Generate text for a task with the module-level model.

    Args:
        instruction: The task description.
        input: Optional extra context, passed to ``generate_prompt``.
        temperature: Forwarded to ``GenerationConfig``.
        top_p: Forwarded to ``GenerationConfig``.
        top_k: Forwarded to ``GenerationConfig``.
        num_beams: Forwarded to ``GenerationConfig``.
        max_new_tokens: Passed to ``model.generate``.
        **kwargs: Any additional ``GenerationConfig`` options.

    Returns:
        The first generated sequence, decoded with special tokens skipped
        and surrounding whitespace stripped.
    """
    prompt = generate_prompt(instruction, input)
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"].to(model.device)

    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    # NOTE(review): the whole first sequence is decoded, so the returned text
    # presumably still begins with the prompt -- confirm this is intended.
    sequence = generation_output.sequences[0]
    output = tokenizer.decode(sequence, skip_special_tokens=True)
    return output.strip()


# Gradio UI: the input components are listed in the same order as the
# positional parameters of ``evaluate``.
g = gr.Interface(
    fn=evaluate,
    inputs=[
        gr.components.Textbox(
            lines=2, label="Task", placeholder="Why is grass green?"
        ),
        gr.components.Textbox(lines=2, label="Input", placeholder="None"),
        gr.components.Slider(minimum=0, maximum=2, value=1.0, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.8, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=5, step=1, value=4, label="Beams"),
        gr.components.Slider(
            minimum=1, maximum=256, step=1, value=256, label="Max tokens"
        ),
    ],
    outputs=[
        # Uses gr.components like every input above; the original used the
        # deprecated input-only ``gr.inputs`` namespace for this output box.
        gr.components.Textbox(
            lines=5,
            label="Output",
        )
    ],
    title="LLaMA 7B Ru Turbo Alpaca",
    description="",
)

# Enable the request queue with a concurrency of one, then start the app.
g.queue(concurrency_count=1)
g.launch()