Spaces: Running on Zero

adds float32 defaults for quantized model tensors
app.py CHANGED

@@ -10,6 +10,9 @@ import os
 import sys
 import requests
 
+# Set torch to use float32 for better compatibility with quantized models
+torch.set_default_dtype(torch.float32)
+
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
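Note: `torch.set_default_dtype(torch.float32)` only changes the dtype used for floating-point tensors that are created afterwards without an explicit dtype. A minimal sketch (not part of app.py) of what it does and does not touch:

import torch

torch.set_default_dtype(torch.float32)

print(torch.get_default_dtype())    # torch.float32
print(torch.empty(2).dtype)         # new float tensors pick up the default -> torch.float32
print(torch.tensor([1, 2]).dtype)   # integer tensors are unaffected -> torch.int64
# Weights already materialized by from_pretrained() keep the dtype they were loaded
# with; the default only applies to tensors created after this call.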
@@ -73,49 +76,6 @@ def download_chat_template():
         logger.error(f"Unexpected error downloading chat template: {e}")
         return None
 
-def get_fallback_chat_template():
-    """Return a fallback chat template if download fails"""
-    return """{# ───── defaults ───── #}
-{%- if enable_thinking is not defined -%}
-{%- set enable_thinking = true -%}
-{%- endif -%}
-
-{# ───── reasoning mode ───── #}
-{%- if enable_thinking -%}
-{%- set reasoning_mode = "/think" -%}
-{%- else -%}
-{%- set reasoning_mode = "/no_think" -%}
-{%- endif -%}
-
-{# ───── header (system message) ───── #}
-{{- "<|im_start|>system\\n" -}}
-{{- system_message | trim -}}
-{{- "<|im_end|>\\n" -}}
-
-{# ───── conversation history ───── #}
-{%- for message in messages -%}
-{%- set content = message.content | trim -%}
-{%- if message.role == "user" -%}
-{{ "<|im_start|>user\\n" + content + "<|im_end|>\\n" }}
-{%- elif message.role == "assistant" -%}
-{%- if content.startswith("<think>") and content.endswith("</think>") -%}
-{{ "<|im_start|>assistant\\n" + content + "<|im_end|>\\n" }}
-{%- else -%}
-{{ "<|im_start|>assistant\\n" + "<think>\\n\\n</think>\\n" + content.lstrip("\\n") + "<|im_end|>\\n" }}
-{%- endif -%}
-{%- elif message.role == "tool" -%}
-{{ "<|im_start|>" + "user\\n" + content + "<|im_end|>\\n" }}
-{%- endif -%}
-{%- endfor -%}
-
-{# ───── generation prompt ───── #}
-{%- if add_generation_prompt -%}
-{%- if reasoning_mode == "/think" -%}
-{{ "<|im_start|>assistant\\n" }}
-{%- else -%}
-{{ "<|im_start|>assistant\\n" + "<think>\\n\\n</think>\\n" }}
-{%- endif -%}
-{%- endif -%}"""
 
 def load_model():
     """Load the model and tokenizer"""
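Note: a template assigned to `tokenizer.chat_template` (whether downloaded or a fallback like the removed one above) is exercised through `apply_chat_template`, and extra keyword arguments are forwarded to the Jinja template; that is how variables such as `system_message` and `enable_thinking` reach it. A rough sketch under that assumption; the checkpoint and the condensed template below are illustrative, not taken from app.py:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")  # illustrative checkpoint
# condensed stand-in for the real template; same variables, much shorter
tokenizer.chat_template = (
    "{{- '<|im_start|>system\\n' -}}{{- system_message | trim -}}{{- '<|im_end|>\\n' -}}"
    "{%- for message in messages -%}"
    "{{ '<|im_start|>' + message.role + '\\n' + (message.content | trim) + '<|im_end|>\\n' }}"
    "{%- endfor -%}"
    "{%- if add_generation_prompt -%}{{ '<|im_start|>assistant\\n' }}{%- endif -%}"
)

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    tokenize=False,
    add_generation_prompt=True,
    system_message="You are a helpful assistant.",  # forwarded to the template as a Jinja variable
)
print(prompt)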
@@ -128,24 +88,23 @@ def load_model():
 
         # Download and set the chat template
         chat_template = download_chat_template()
-        if chat_template:
-            tokenizer.chat_template = chat_template
-            logger.info("Chat template downloaded and set successfully")
-        else:
-            # Use fallback chat template
-            logger.warning("Failed to download chat template, using fallback")
-            tokenizer.chat_template = get_fallback_chat_template()
-            logger.info("Fallback chat template set successfully")
+        tokenizer.chat_template = chat_template
+        logger.info("Chat template downloaded and set successfully")
+
 
         # Load the int4 model from local path
         logger.info(f"Loading int4 model from {MAIN_MODEL_ID}")
-        model = AutoModelForCausalLM.from_pretrained(
-            MAIN_MODEL_ID,
-            subfolder="int4",
-            device_map=...,
-            torch_dtype=...,
-            trust_remote_code=...,
-        )
+
+        # Configure model loading parameters for int4 quantization
+        model_kwargs = {
+            "device_map": "auto" if DEVICE == "cuda" else "cpu",
+            "torch_dtype": torch.float32,  # Use float32 for int4 quantized models
+            "trust_remote_code": True,
+            "low_cpu_mem_usage": True,  # Help with memory management
+        }
+
+        logger.info(f"Model loading parameters: {model_kwargs}")
+        model = AutoModelForCausalLM.from_pretrained(MAIN_MODEL_ID, subfolder="int4", **model_kwargs)
 
         if tokenizer.pad_token_id is None:
             tokenizer.pad_token_id = tokenizer.eos_token_id
@@ -155,6 +114,7 @@ def load_model():
 
     except Exception as e:
         logger.error(f"Error loading model: {e}")
+        logger.error(f"Model config: {model.config if model else 'Model not loaded'}")
         return False
 
 
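Note: to see what the new loading parameters actually produce, one way is to load with the same kwargs and tally tensor dtypes. A hedged sketch: the repo id below is a placeholder (app.py defines the real MAIN_MODEL_ID), and a custom int4 checkpoint loaded with trust_remote_code may also keep packed integer buffers alongside float32 parameters:

import torch
from collections import Counter
from transformers import AutoModelForCausalLM

MODEL_ID = "your-org/your-int4-model"  # placeholder; app.py uses its own MAIN_MODEL_ID

model_kwargs = {
    "device_map": "cpu",              # "auto" when a GPU is available, as in app.py
    "torch_dtype": torch.float32,
    "trust_remote_code": True,
    "low_cpu_mem_usage": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, subfolder="int4", **model_kwargs)

# Tally dtypes of parameters and buffers to confirm the float32 default took effect
print(Counter(p.dtype for p in model.parameters()))
print(Counter(b.dtype for b in model.buffers()))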
@@ -207,22 +167,44 @@ def generate_response(message, history, system_message, max_tokens, temperature,
     # Tokenize the input
     inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
 
+    # Debug input tensor information
+    logger.info(f"Input tensor shapes: {[(k, v.shape, v.dtype) for k, v in inputs.items()]}")
+
     # Move to device
     if DEVICE == "cuda":
         inputs = {k: v.cuda() for k, v in inputs.items()}
 
     # Generate response
     with torch.no_grad():
-        output_ids = model.generate(
-            inputs['input_ids'],
-            max_new_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            do_sample=do_sample,
-            attention_mask=inputs['attention_mask'],
-            pad_token_id=tokenizer.eos_token_id,
-            eos_token_id=tokenizer.eos_token_id
-        )
+        try:
+            output_ids = model.generate(
+                inputs['input_ids'],
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                do_sample=do_sample,
+                attention_mask=inputs['attention_mask'],
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id
+            )
+        except RuntimeError as e:
+            if "expected scalar type" in str(e):
+                logger.error(f"Data type mismatch error: {e}")
+                # Try with explicit dtype conversion
+                inputs['input_ids'] = inputs['input_ids'].to(torch.int64)
+                inputs['attention_mask'] = inputs['attention_mask'].to(torch.int64)
+                output_ids = model.generate(
+                    inputs['input_ids'],
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    do_sample=do_sample,
+                    attention_mask=inputs['attention_mask'],
+                    pad_token_id=tokenizer.eos_token_id,
+                    eos_token_id=tokenizer.eos_token_id
+                )
+            else:
+                raise e
 
     # Decode the response
     response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
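Note: the retry branch above casts the token tensors to int64 only after a dtype error. An alternative, sketched below under the same assumptions, is to normalize them up front; tokenizers already return int64 input_ids, so this is usually a no-op, but it makes the expectation explicit before generate() runs:

import torch

def to_long(inputs: dict) -> dict:
    """Cast input_ids / attention_mask to int64, leaving any other entries untouched."""
    return {
        k: (v.to(torch.long) if k in ("input_ids", "attention_mask") else v)
        for k, v in inputs.items()
    }

# Usage, mirroring the tokenize step in app.py (tokenizer and full_prompt come from the surrounding code):
# inputs = to_long(tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True))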
|