from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gc
import gradio as gr
import torch
from huggingface_hub import HfApi, login

api = HfApi()


def info_fn(text):
    gr.Info(text)


def warning_fn(text):
    gr.Warning(text)


def upload(hf_token, base_model_name_or_path, peft_model_path, output_dir):
    """Merge a LoRA adapter into its base model and push the result to the Hub."""
    try:
        login(hf_token)
        repo_name = output_dir

        # Load everything on CPU: merging the adapter does not need a GPU.
        device_arg = {"device_map": "cpu"}

        info_fn(f"Loading base model: {base_model_name_or_path}")
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name_or_path, torch_dtype=torch.bfloat16, **device_arg
        )

        info_fn(f"Loading PEFT adapter: {peft_model_path}")
        model = PeftModel.from_pretrained(base_model, peft_model_path, **device_arg)

        info_fn("Running merge_and_unload...")
        # Fold the low-rank adapter weights into the base weights and drop the PEFT wrappers.
        model = model.merge_and_unload()
        tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path)

        info_fn("Saving model...")
        model.save_pretrained(output_dir, safe_serialization=True)

        info_fn("Saving tokenizer...")
        tokenizer.save_pretrained(output_dir)

        info_fn(f"Model saved to {output_dir}")

        # Free the merged model before uploading; the files are already on disk.
        del model
        gc.collect()

        try:
            info_fn("Creating repo...")
            info_fn(str(api.create_repo(repo_id=repo_name)))
        except Exception as e:
            warning_fn(f"Repo already exists: {e}")

        info_fn("Uploading to the Hub...")
        return api.upload_folder(
            folder_path=output_dir,
            repo_id=repo_name,
            repo_type="model",
        )

    except Exception as e:
        gc.collect()
        # gr.Error is an exception: it must be raised to surface in the UI.
        raise gr.Error(str(e))


INTRODUCTION_TEXT = """
🎯 This space lets you merge your LoRA adapters into their base model.

## ❓ What is LoRA?

LoRA (Low-Rank Adaptation of Large Language Models) lets you fine-tune LLMs at low cost. LoRA freezes the pre-trained model weights and injects trainable rank-decomposition matrices into each layer of the Transformer architecture, greatly reducing the number of trainable parameters for downstream tasks.
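
Concretely, for a frozen weight matrix `W ∈ ℝ^{d×k}`, LoRA learns an update `ΔW = B·A` with `B ∈ ℝ^{d×r}`, `A ∈ ℝ^{r×k}`, and rank `r ≪ min(d, k)`, so only `r·(d + k)` parameters are trained instead of `d·k`.
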
You can learn more about LoRA here:

[📝 LoRA: Low-Rank Adaptation of Large Language Models Arxiv](https://arxiv.org/abs/2106.09685)

## 🛠️ How does this space work?

🛠️ This space's backend mainly runs the Transformers and PEFT libraries.

🤖 The code first loads your base model and then your adapter model.

📚 The code merges the adapter weights into the base weights using the `merge_and_unload` function from the PEFT library.

📤 The code saves the merged model temporarily and then pushes it to the Hub; the whole pipeline looks like the sketch below.
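
A minimal sketch of that pipeline (the model names below are placeholders, not defaults of this space):

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel
import torch

base = AutoModelForCausalLM.from_pretrained("base-model", torch_dtype=torch.bfloat16, device_map="cpu")
model = PeftModel.from_pretrained(base, "lora-adapter")
merged = model.merge_and_unload()  # fold the low-rank update into the frozen weights
merged.save_pretrained("merged-model", safe_serialization=True)
```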

## 🧮 Required RAM

This space loads the model into RAM without any quantization, so the required RAM is high.

You can merge models of up to about 7B parameters. (If your adapter weights are very large, the merge may still run out of memory.)
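
As a rough estimate, a 7B-parameter model in bfloat16 needs about 7 × 10⁹ × 2 bytes ≈ 14 GB of RAM for the weights alone, before counting the adapter weights and merge overhead.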
"""


with gr.Blocks() as demo:
    gr.Markdown("""<h1 align="center" id="space-title">🚀 Lora Merge</h1>""")
    gr.Markdown(INTRODUCTION_TEXT)

    with gr.Row():
        with gr.Column(scale=1):
            hf_token = gr.Textbox(label="Hugging Face Write Access Token", type="password")
            base_model_name_or_path = gr.Textbox(label="Base Model")
            peft_model_path = gr.Textbox(label="Adapter Model")
            output_dir = gr.Textbox(label="Output Model Name")

        with gr.Column(scale=1):
            text = gr.Textbox(label="Output", lines=14)

    submit = gr.Button("Merge LoRA adapter into base model")
    submit.click(fn=upload, inputs=[hf_token, base_model_name_or_path, peft_model_path, output_dir], outputs=text)


demo.queue()
demo.launch(show_error=True)
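
# For reference, the merge can also be driven without the UI by calling `upload`
# directly. A hypothetical example (token and repo names are placeholders; the
# gr.Info/gr.Warning toasts only show up inside a running Gradio app):
#
#     upload(
#         hf_token="hf_xxx",
#         base_model_name_or_path="meta-llama/Llama-2-7b-hf",
#         peft_model_path="username/my-lora-adapter",
#         output_dir="username/my-merged-model",
#     )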