Tonic committed
Commit 55d7c97 · 1 Parent(s): a3113ce

adds float32 defaults for quantized model tensors
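The core of the change is making float32 the process-wide default floating-point dtype before the int4 checkpoint is loaded, and passing torch_dtype=torch.float32 explicitly to from_pretrained. As a minimal standalone sketch (plain PyTorch, independent of this app) of what the default-dtype override does:

import torch

# After this call, any new floating-point tensor created without an explicit dtype
# uses float32 instead of whatever the default was (e.g. if other code had switched
# it to bfloat16/float16) - the "compatibility with quantized models" the commit targets.
torch.set_default_dtype(torch.float32)

x = torch.tensor([1.0, 2.0])
assert x.dtype == torch.float32
print(torch.get_default_dtype())  # torch.float32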

Files changed (1)
  1. app.py +50 -68
app.py CHANGED
@@ -10,6 +10,9 @@ import os
 import sys
 import requests
 
+# Set torch to use float32 for better compatibility with quantized models
+torch.set_default_dtype(torch.float32)
+
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -73,49 +76,6 @@ def download_chat_template():
         logger.error(f"Unexpected error downloading chat template: {e}")
         return None
 
-def get_fallback_chat_template():
-    """Return a fallback chat template if download fails"""
-    return """{# ───── defaults ───── #}
-{%- if enable_thinking is not defined -%}
-{%- set enable_thinking = true -%}
-{%- endif -%}
-
-{# ───── reasoning mode ───── #}
-{%- if enable_thinking -%}
-{%- set reasoning_mode = "/think" -%}
-{%- else -%}
-{%- set reasoning_mode = "/no_think" -%}
-{%- endif -%}
-
-{# ───── header (system message) ───── #}
-{{- "<|im_start|>system\\n" -}}
-{{- system_message | trim -}}
-{{- "<|im_end|>\\n" -}}
-
-{# ───── conversation history ───── #}
-{%- for message in messages -%}
-    {%- set content = message.content | trim -%}
-    {%- if message.role == "user" -%}
-        {{ "<|im_start|>user\\n" + content + "<|im_end|>\\n" }}
-    {%- elif message.role == "assistant" -%}
-        {%- if content.startswith("<think>") and content.endswith("</think>") -%}
-            {{ "<|im_start|>assistant\\n" + content + "<|im_end|>\\n" }}
-        {%- else -%}
-            {{ "<|im_start|>assistant\\n" + "<think>\\n\\n</think>\\n" + content.lstrip("\\n") + "<|im_end|>\\n" }}
-        {%- endif -%}
-    {%- elif message.role == "tool" -%}
-        {{ "<|im_start|>" + "user\\n" + content + "<|im_end|>\\n" }}
-    {%- endif -%}
-{%- endfor -%}
-
-{# ───── generation prompt ───── #}
-{%- if add_generation_prompt -%}
-    {%- if reasoning_mode == "/think" -%}
-        {{ "<|im_start|>assistant\\n" }}
-    {%- else -%}
-        {{ "<|im_start|>assistant\\n" + "<think>\\n\\n</think>\\n" }}
-    {%- endif -%}
-{%- endif -%}"""
 
 def load_model():
     """Load the model and tokenizer"""
@@ -128,24 +88,23 @@ def load_model():
 
         # Download and set the chat template
        chat_template = download_chat_template()
-        if chat_template:
-            tokenizer.chat_template = chat_template
-            logger.info("Chat template downloaded and set successfully")
-        else:
-            # Use fallback chat template
-            logger.warning("Failed to download chat template, using fallback")
-            tokenizer.chat_template = get_fallback_chat_template()
-            logger.info("Fallback chat template set successfully")
+        tokenizer.chat_template = chat_template
+        logger.info("Chat template downloaded and set successfully")
+
 
         # Load the int4 model from local path
         logger.info(f"Loading int4 model from {MAIN_MODEL_ID}")
-        model = AutoModelForCausalLM.from_pretrained(
-            MAIN_MODEL_ID,
-            subfolder="int4",
-            device_map="auto" if DEVICE == "cuda" else "cpu",
-            torch_dtype=torch.bfloat16,
-            trust_remote_code=True
-        )
+
+        # Configure model loading parameters for int4 quantization
+        model_kwargs = {
+            "device_map": "auto" if DEVICE == "cuda" else "cpu",
+            "torch_dtype": torch.float32,  # Use float32 for int4 quantized models
+            "trust_remote_code": True,
+            "low_cpu_mem_usage": True,  # Help with memory management
+        }
+
+        logger.info(f"Model loading parameters: {model_kwargs}")
+        model = AutoModelForCausalLM.from_pretrained(MAIN_MODEL_ID, subfolder="int4", **model_kwargs)
 
         if tokenizer.pad_token_id is None:
             tokenizer.pad_token_id = tokenizer.eos_token_id
@@ -155,6 +114,7 @@ def load_model():
 
     except Exception as e:
         logger.error(f"Error loading model: {e}")
+        logger.error(f"Model config: {model.config if model else 'Model not loaded'}")
         return False
 
 
@@ -207,22 +167,44 @@ def generate_response(message, history, system_message, max_tokens, temperature,
     # Tokenize the input
     inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
 
+    # Debug input tensor information
+    logger.info(f"Input tensor shapes: {[(k, v.shape, v.dtype) for k, v in inputs.items()]}")
+
     # Move to device
     if DEVICE == "cuda":
         inputs = {k: v.cuda() for k, v in inputs.items()}
 
     # Generate response
     with torch.no_grad():
-        output_ids = model.generate(
-            inputs['input_ids'],
-            max_new_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            do_sample=do_sample,
-            attention_mask=inputs['attention_mask'],
-            pad_token_id=tokenizer.eos_token_id,
-            eos_token_id=tokenizer.eos_token_id
-        )
+        try:
+            output_ids = model.generate(
+                inputs['input_ids'],
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                do_sample=do_sample,
+                attention_mask=inputs['attention_mask'],
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id
+            )
+        except RuntimeError as e:
+            if "expected scalar type" in str(e):
+                logger.error(f"Data type mismatch error: {e}")
+                # Try with explicit dtype conversion
+                inputs['input_ids'] = inputs['input_ids'].to(torch.int64)
+                inputs['attention_mask'] = inputs['attention_mask'].to(torch.int64)
+                output_ids = model.generate(
+                    inputs['input_ids'],
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    do_sample=do_sample,
+                    attention_mask=inputs['attention_mask'],
+                    pad_token_id=tokenizer.eos_token_id,
+                    eos_token_id=tokenizer.eos_token_id
+                )
+            else:
+                raise e
 
     # Decode the response
     response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
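Not part of the commit, but a small diagnostic sketch in the same spirit as the new debug logging: after from_pretrained returns in load_model, one could tally the dtypes the int4 checkpoint actually materialized with (quantized weights plus any float32 scales/buffers). The helper name log_param_dtypes is hypothetical; model and logger are the app's own objects.

from collections import Counter
import torch

def log_param_dtypes(model, logger):
    # Hypothetical helper: count parameters by dtype to confirm how the
    # quantized checkpoint was loaded under the float32 default.
    counts = Counter(str(p.dtype) for p in model.parameters())
    logger.info(f"Parameter dtype breakdown: {dict(counts)}")
    logger.info(f"Default dtype: {torch.get_default_dtype()}")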