Yong Liu committed
Commit: dc63702
Parent(s): fe3660d

update handler

handler.py  CHANGED  (+36 -48)
@@ -1,50 +1,37 @@
 import os
 import json
 import torch
-from transformers import …
-from typing import Dict, List, Any
-
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from typing import Dict, List, Any
+
+# Fix for the rope_scaling validation issue
+import transformers.models.phi3.configuration_phi3
+# Store original method
+original_validation = transformers.models.phi3.configuration_phi3.Phi3Config._rope_scaling_validation
+
+# Replace with a no-op function
+def no_validation(self):
+    pass
+
+# Apply the patch
+transformers.models.phi3.configuration_phi3.Phi3Config._rope_scaling_validation = no_validation
 
 class EndpointHandler:
     def __init__(self, path=""):
         # Initialize model and tokenizer
         self.model_path = path if path else os.environ.get("MODEL_PATH", "")
-
-        # Monkey patch the RoPE scaling validation to bypass the length check
-        try:
-            from transformers.models.phi3.configuration_phi3 import Phi3Config
-            original_validation = Phi3Config._rope_scaling_validation
-
-            # Create a patched version that doesn't validate length
-            @functools.wraps(original_validation)
-            def patched_validation(self_config):
-                # Skip validation if short_factor length is 48
-                if (hasattr(self_config, "rope_scaling") and
-                    "short_factor" in self_config.rope_scaling and
-                    len(self_config.rope_scaling["short_factor"]) == 48):
-                    print("Bypassing RoPE scaling validation for short_factor of length 48")
-                    return
-                # Otherwise call the original validation
-                return original_validation(self_config)
-
-            # Apply the monkey patch
-            Phi3Config._rope_scaling_validation = patched_validation
-            print("Successfully patched RoPE scaling validation")
-        except Exception as e:
-            print(f"Warning: Could not patch RoPE scaling validation: {str(e)}")
+        print(f"Loading model from: {self.model_path}")
 
         # Load tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
 
-        # …
-        self.…
-            model=self.model_path,
-            tokenizer=self.tokenizer,
+        # Load model directly without pipeline
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_path,
             torch_dtype=torch.float16,
-            device_map="auto",
-            return_full_text=False  # Only return the generated text, not the prompt
+            device_map="auto"
         )
+        print("Model loaded successfully")
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """Handle inference request in OpenAI-like format"""
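The new version moves the RoPE-scaling workaround to import time: instead of wrapping Phi3Config._rope_scaling_validation in a try/except with functools.wraps (the old code apparently never imported functools, so the patch could fall through to the warning branch), it replaces the method with a no-op before any config is loaded, and it loads the weights with AutoModelForCausalLM.from_pretrained rather than a text-generation pipeline. A minimal local smoke test of the new __init__ might look like the sketch below; the payload fields are assumptions for illustration, since the real request schema is whatever __call__ parses into "prompt" and "generation_params".

# Rough smoke test for the updated handler (a sketch, not part of this commit).
# The request fields below are assumed; adapt them to what __call__ expects.
from handler import EndpointHandler

handler = EndpointHandler(path="/repository")  # assumed path to the model files

payload = {
    "messages": [{"role": "user", "content": "Say hello."}],  # assumed field names
    "max_tokens": 64,
    "temperature": 0.7,
}
print(handler(payload))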
@@ -113,36 +100,37 @@ class EndpointHandler:
         return prompt
 
     def _generate(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
-        """Generate response using the …"""
+        """Generate response using the model directly"""
         prompt = inputs["prompt"]
         params = inputs["generation_params"]
 
+        # Tokenize input
+        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.model.device)
+
         # Count input tokens
-        input_tokens = …
+        input_tokens = input_ids.shape[1]
 
-        # Convert OpenAI-like parameters to …
+        # Convert OpenAI-like parameters to HF parameters
         generation_kwargs = {
             "max_new_tokens": params["max_tokens"],
             "temperature": params["temperature"],
             "top_p": params["top_p"],
             "num_return_sequences": params["n"],
             "do_sample": params["temperature"] > 0,
+            "pad_token_id": self.tokenizer.eos_token_id,
         }
 
-        # …
-        …
-            prompt,
-            **generation_kwargs
-        )
+        # Generate output
+        with torch.no_grad():
+            outputs = self.model.generate(
+                input_ids,
+                **generation_kwargs
+            )
 
-        # …
+        # Decode output
         generated_texts = []
-        for …
-            gen_text = …
+        for i in range(params["n"]):
+            gen_text = self.tokenizer.decode(outputs[i][input_tokens:], skip_special_tokens=True)
 
         # Apply stop sequences if provided
         if params["stop"]: