masanorihirano committed
Commit: bed8c52
Parent(s): c693f6c

added compress

app.py CHANGED
@@ -17,6 +17,7 @@ from fastchat.model.model_adapter import BaseAdapter
 from fastchat.model.model_adapter import load_model
 from fastchat.model.model_adapter import model_adapters
 from fastchat.serve.cli import SimpleChatIO
+from fastchat.serve.inference import compress_module
 from fastchat.serve.inference import generate_stream
 from huggingface_hub import Repository
 from huggingface_hub import snapshot_download
@@ -70,6 +71,8 @@ def load_lora_model(
         cpu_offloading=cpu_offloading,
         debug=debug,
     )
+    if load_8bit:
+        compress_module(model)
     if lora_weight is not None:
         # model = PeftModelForCausalLM.from_pretrained(model, model_path, **kwargs)
         config = LoraConfig.from_pretrained(lora_weight)
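
For context, a minimal sketch of the load path in load_lora_model after this commit, assuming a load_model signature consistent with the keyword arguments visible in the hunk above; the device, num_gpus, cpu_offloading, and debug values below are illustrative placeholders, not values taken from this repo.

    from fastchat.model.model_adapter import load_model
    from fastchat.serve.inference import compress_module

    def load_8bit_model(model_path: str, load_8bit: bool = True):
        # Load the base model first, as app.py does ...
        model, tokenizer = load_model(
            model_path,
            device="cuda",        # placeholder; app.py passes its own settings
            num_gpus=1,           # placeholder
            load_8bit=load_8bit,
            cpu_offloading=False, # placeholder
            debug=False,          # placeholder
        )
        # ... then, as added in this commit, compress the module
        # in place when 8-bit loading was requested.
        if load_8bit:
            compress_module(model)
        return model, tokenizer

Note that per the second hunk, the compression runs before the `if lora_weight is not None:` block, i.e. before any LoRA weights are attached to the model.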