'''
CREDIT: script adapted from [alpaca](https://huggingface.co/spaces/tloen/alpaca-lora/blob/main/app.py).
'''
import gradio as gr
import random
import time
import transformers
import os
import json
import torch
import argparse
from tqdm import tqdm
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig


def apply_delta(base_model_path, target_model_path, delta_path):
    # Merge the released delta weights into the base LLaMA model and return
    # the merged model together with the delta tokenizer.
    print(f"Loading the delta weights from {delta_path}")
    delta_tokenizer = LlamaTokenizer.from_pretrained(delta_path, use_fast=False)
    delta = LlamaForCausalLM.from_pretrained(
        delta_path, low_cpu_mem_usage=True, torch_dtype=torch.float16
    )

    print(f"Loading the base model from {base_model_path}")
    base_tokenizer = LlamaTokenizer.from_pretrained(base_model_path, use_fast=False)
    base = LlamaForCausalLM.from_pretrained(
        base_model_path, low_cpu_mem_usage=True, torch_dtype=torch.float16
    )

    # Following the Alpaca training recipe, add the newly initialized special
    # tokens and zero out their embeddings before applying the delta.
    DEFAULT_PAD_TOKEN = "[PAD]"
    DEFAULT_EOS_TOKEN = "</s>"
    DEFAULT_BOS_TOKEN = "<s>"
    DEFAULT_UNK_TOKEN = "<unk>"
    special_tokens_dict = {
        "pad_token": DEFAULT_PAD_TOKEN,
        "eos_token": DEFAULT_EOS_TOKEN,
        "bos_token": DEFAULT_BOS_TOKEN,
        "unk_token": DEFAULT_UNK_TOKEN,
    }
    num_new_tokens = base_tokenizer.add_special_tokens(special_tokens_dict)
    base.resize_token_embeddings(len(base_tokenizer))
    input_embeddings = base.get_input_embeddings().weight.data
    output_embeddings = base.get_output_embeddings().weight.data
    input_embeddings[-num_new_tokens:] = 0
    output_embeddings[-num_new_tokens:] = 0

    print("Applying the delta")
    target_weights = {}
    for name, param in tqdm(base.state_dict().items(), desc="Applying delta"):
        assert name in delta.state_dict()
        param.data += delta.state_dict()[name]
        target_weights[name] = param.data

    print(f"Saving the target model to {target_model_path}")
    base.load_state_dict(target_weights)
    # base.save_pretrained(target_model_path)
    # delta_tokenizer.save_pretrained(target_model_path)

    delta = None
    return base, delta_tokenizer


base_weights = 'decapoda-research/llama-7b-hf'
target_weights = 'expertllama'  # local path
delta_weights = 'OFA-Sys/expertllama-7b-delta'
model, tokenizer = apply_delta(base_weights, target_weights, delta_weights)

# cast the merged model to full precision for inference
model = model.to(torch.float)
if torch.__version__ >= "2":
    model = torch.compile(model)


def respond(
    instruction,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    **kwargs,
):
    # prompt wrapper, only single-turn is allowed for now
    prompt = f"### Human:\n{instruction}\n\n### Assistant:\n"
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=inputs["input_ids"],
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    # keep only the assistant's reply from the decoded sequence
    response = tokenizer.decode(generation_output.sequences[0][:-2]).split(
        "### Assistant:\n", 1
    )[1]
    return response


g = gr.Interface(
    fn=respond,
    inputs=[
        gr.components.Textbox(lines=2, label="Instruction"),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
        gr.components.Slider(
            minimum=1, maximum=768, step=1, value=512, label="Max tokens"
        ),
    ],
    outputs=[
        gr.components.Textbox(
            lines=8,
label="Output", ) ], title="ExpertLLaMA", description="ExpertLLaMA is an open-source chatbot trained on expert-like data produced with GPT-3.5, see our [project repo](https://github.com/OFA-Sys/ExpertLLaMA) for details.", ) g.queue(concurrency_count=1) g.launch()