minhdang commited on
Commit
5834081
1 Parent(s): 8c2b2a9

Update inference.py

Browse files
Files changed (1) hide show
  1. inference.py +5 -2
inference.py CHANGED
@@ -34,15 +34,18 @@ from deepseek_vl.models import MultiModalityCausalLM, VLChatProcessor
34
  from deepseek_vl.utils.conversation import Conversation
35
 
36
  from transformers import BitsAndBytesConfig
 
37
 
 
38
 
39
def load_model(model_path):
    """Load the DeepSeek-VL model in 8-bit, plus its tokenizer and chat processor.

    Args:
        model_path: HuggingFace model id or local checkpoint path.

    Returns:
        Tuple of (tokenizer, vl_gpt, vl_chat_processor), with the model in
        eval mode and sharded across available devices via device_map="auto".
    """
    vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
    tokenizer = vl_chat_processor.tokenizer
    # Passing load_in_8bit=True directly to from_pretrained is deprecated;
    # use the BitsAndBytesConfig already imported at the top of this file.
    vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        low_cpu_mem_usage=True,
        device_map="auto",
    )
    vl_gpt = vl_gpt.eval()
    return tokenizer, vl_gpt, vl_chat_processor
47
 
48
 
 
34
  from deepseek_vl.utils.conversation import Conversation
35
 
36
  from transformers import BitsAndBytesConfig
37
from transformers import QuantoConfig

# Quantize weights to int8 at load time to roughly halve GPU memory use.
quanto_config = QuantoConfig(weights="int8")


def load_model(model_path):
    """Load the DeepSeek-VL model (quanto int8-quantized), plus tokenizer and chat processor.

    Args:
        model_path: HuggingFace model id or local checkpoint path.

    Returns:
        Tuple of (tokenizer, vl_gpt, vl_chat_processor), with the model on
        the GPU in eval mode.
    """
    vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
    tokenizer = vl_chat_processor.tokenizer
    vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        quantization_config=quanto_config,
        low_cpu_mem_usage=True,
    )
    # Do NOT cast dtype here: transformers raises a ValueError when
    # `.to(dtype)` is called on a quantized model — the quantized weights
    # already define their own dtype. Device moves remain allowed.
    vl_gpt = vl_gpt.cuda().eval()
    return tokenizer, vl_gpt, vl_chat_processor
50
 
51