Spaces:
Runtime error
Runtime error
Filip
committed on
Commit
·
fe01251
1
Parent(s):
56d8f41
update
Browse files
app.py
CHANGED
@@ -5,33 +5,42 @@ import gc
|
|
5 |
import os
|
6 |
|
7 |
# Enable better CPU performance
|
8 |
-
torch.set_num_threads(4)
|
9 |
device = "cpu"
|
10 |
|
11 |
def load_model():
|
12 |
model_name = "forestav/unsloth_vision_radiography_finetune"
|
|
|
13 |
|
14 |
-
# Load tokenizer and processor first to free up memory
|
15 |
print("Loading tokenizer and processor...")
|
16 |
-
tokenizer
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
print("Loading model...")
|
20 |
# Load model with CPU optimizations
|
21 |
model = AutoModelForCausalLM.from_pretrained(
|
22 |
model_name,
|
23 |
device_map="cpu",
|
24 |
-
torch_dtype=torch.float32,
|
25 |
low_cpu_mem_usage=True,
|
26 |
-
offload_folder="offload",
|
27 |
-
offload_state_dict=True
|
|
|
28 |
)
|
29 |
|
30 |
-
# Quantize the model for CPU
|
31 |
print("Quantizing model...")
|
32 |
model = torch.quantization.quantize_dynamic(
|
33 |
model,
|
34 |
-
{torch.nn.Linear},
|
35 |
dtype=torch.qint8
|
36 |
)
|
37 |
|
@@ -81,7 +90,7 @@ def analyze_image(image, instruction):
|
|
81 |
min_p=0.1,
|
82 |
use_cache=True,
|
83 |
pad_token_id=tokenizer.eos_token_id,
|
84 |
-
num_beams=1
|
85 |
)
|
86 |
|
87 |
# Decode the response
|
|
|
5 |
import os
|
6 |
|
7 |
# Enable better CPU performance
|
8 |
+
torch.set_num_threads(4)
|
9 |
device = "cpu"
|
10 |
|
11 |
def load_model():
|
12 |
model_name = "forestav/unsloth_vision_radiography_finetune"
|
13 |
+
base_model_name = "unsloth/Llama-3.2-11B-Vision-Instruct" # Correct base model
|
14 |
|
|
|
15 |
print("Loading tokenizer and processor...")
|
16 |
+
# Load tokenizer from base model
|
17 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
18 |
+
base_model_name,
|
19 |
+
trust_remote_code=True
|
20 |
+
)
|
21 |
+
|
22 |
+
# Load processor from base model
|
23 |
+
processor = AutoProcessor.from_pretrained(
|
24 |
+
base_model_name,
|
25 |
+
trust_remote_code=True
|
26 |
+
)
|
27 |
|
28 |
print("Loading model...")
|
29 |
# Load model with CPU optimizations
|
30 |
model = AutoModelForCausalLM.from_pretrained(
|
31 |
model_name,
|
32 |
device_map="cpu",
|
33 |
+
torch_dtype=torch.float32,
|
34 |
low_cpu_mem_usage=True,
|
35 |
+
offload_folder="offload",
|
36 |
+
offload_state_dict=True,
|
37 |
+
trust_remote_code=True
|
38 |
)
|
39 |
|
|
|
40 |
print("Quantizing model...")
|
41 |
model = torch.quantization.quantize_dynamic(
|
42 |
model,
|
43 |
+
{torch.nn.Linear},
|
44 |
dtype=torch.qint8
|
45 |
)
|
46 |
|
|
|
90 |
min_p=0.1,
|
91 |
use_cache=True,
|
92 |
pad_token_id=tokenizer.eos_token_id,
|
93 |
+
num_beams=1
|
94 |
)
|
95 |
|
96 |
# Decode the response
|