Filip committed on
Commit fe01251 · 1 Parent(s): 56d8f41
Files changed (1)
  1. app.py +19 -10
app.py CHANGED
@@ -5,33 +5,42 @@ import gc
 import os
 
 # Enable better CPU performance
-torch.set_num_threads(4) # Adjust based on available CPU cores
+torch.set_num_threads(4)
 device = "cpu"
 
 def load_model():
     model_name = "forestav/unsloth_vision_radiography_finetune"
+    base_model_name = "unsloth/Llama-3.2-11B-Vision-Instruct" # Correct base model
 
-    # Load tokenizer and processor first to free up memory
     print("Loading tokenizer and processor...")
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    processor = AutoProcessor.from_pretrained(model_name)
+    # Load tokenizer from base model
+    tokenizer = AutoTokenizer.from_pretrained(
+        base_model_name,
+        trust_remote_code=True
+    )
+
+    # Load processor from base model
+    processor = AutoProcessor.from_pretrained(
+        base_model_name,
+        trust_remote_code=True
+    )
 
     print("Loading model...")
     # Load model with CPU optimizations
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         device_map="cpu",
-        torch_dtype=torch.float32, # Use float32 for CPU
+        torch_dtype=torch.float32,
         low_cpu_mem_usage=True,
-        offload_folder="offload", # Enable disk offloading
-        offload_state_dict=True # Offload state dict to disk
+        offload_folder="offload",
+        offload_state_dict=True,
+        trust_remote_code=True
     )
 
-    # Quantize the model for CPU
     print("Quantizing model...")
     model = torch.quantization.quantize_dynamic(
         model,
-        {torch.nn.Linear}, # Quantize linear layers
+        {torch.nn.Linear},
         dtype=torch.qint8
     )
 
@@ -81,7 +90,7 @@ def analyze_image(image, instruction):
         min_p=0.1,
         use_cache=True,
         pad_token_id=tokenizer.eos_token_id,
-        num_beams=1 # Reduce beam search to save memory
+        num_beams=1
     )
 
     # Decode the response
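
For context, the torch.quantization.quantize_dynamic call kept by this commit converts the weights of the listed layer types to int8 while activations stay in float and are quantized on the fly at inference time, so no calibration data is needed. A minimal runnable sketch of that step, using a toy module in place of the 11B vision checkpoint; the layer set and dtype mirror the diff, everything else is illustrative:

    # Illustrative only: a small stand-in for the real model.
    import torch
    import torch.nn as nn

    toy = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 10))

    # Swap the weights of the listed layer types to int8; activations
    # remain float and are quantized per forward pass.
    quantized = torch.quantization.quantize_dynamic(
        toy,
        {nn.Linear},        # same layer set as in the commit
        dtype=torch.qint8   # same dtype as in the commit
    )

    x = torch.randn(1, 128)
    print(quantized(x).shape)  # torch.Size([1, 10])

Only layer types in the set are replaced; everything else keeps its float32 weights, so for a transformer the memory savings come mostly from the large nn.Linear projection matrices.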