masanorihirano committed
Commit: bed8c52
Parent(s): c693f6c

added compress

app.py CHANGED
@@ -17,6 +17,7 @@ from fastchat.model.model_adapter import BaseAdapter
 from fastchat.model.model_adapter import load_model
 from fastchat.model.model_adapter import model_adapters
 from fastchat.serve.cli import SimpleChatIO
+from fastchat.serve.inference import compress_module
 from fastchat.serve.inference import generate_stream
 from huggingface_hub import Repository
 from huggingface_hub import snapshot_download
@@ -70,6 +71,8 @@ def load_lora_model(
         cpu_offloading=cpu_offloading,
         debug=debug,
     )
+    if load_8bit:
+        compress_module(model)
     if lora_weight is not None:
         # model = PeftModelForCausalLM.from_pretrained(model, model_path, **kwargs)
         config = LoraConfig.from_pretrained(lora_weight)
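
For context, a minimal sketch of the load path in load_lora_model after this commit, assuming a load_model signature consistent with the keyword arguments visible in the hunk above; the device, num_gpus, cpu_offloading, and debug values below are illustrative placeholders, not values taken from this repo.

    from fastchat.model.model_adapter import load_model
    from fastchat.serve.inference import compress_module

    def load_8bit_model(model_path: str, load_8bit: bool = True):
        # Load the base model first, as app.py does ...
        model, tokenizer = load_model(
            model_path,
            device="cuda",        # placeholder; app.py passes its own settings
            num_gpus=1,           # placeholder
            load_8bit=load_8bit,
            cpu_offloading=False, # placeholder
            debug=False,          # placeholder
        )
        # ... then, as added in this commit, compress the module
        # in place when 8-bit loading was requested.
        if load_8bit:
            compress_module(model)
        return model, tokenizer

Note that per the second hunk, the compression runs before the `if lora_weight is not None:` block, i.e. before any LoRA weights are attached to the model.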