Yong Liu committed on
Commit
dc63702
·
1 Parent(s): fe3660d

update handler

Browse files
Files changed (1) hide show
  1. handler.py +36 -48
handler.py CHANGED
@@ -1,50 +1,37 @@
1
  import os
2
  import json
3
  import torch
4
- from transformers import pipeline, AutoTokenizer, AutoConfig
5
- from typing import Dict, List, Any, Optional, Union
6
- import functools
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  class EndpointHandler:
9
  def __init__(self, path=""):
10
  # Initialize model and tokenizer
11
  self.model_path = path if path else os.environ.get("MODEL_PATH", "")
12
-
13
- # Monkey patch the RoPE scaling validation to bypass the length check
14
- try:
15
- from transformers.models.phi3.configuration_phi3 import Phi3Config
16
- original_validation = Phi3Config._rope_scaling_validation
17
-
18
- # Create a patched version that doesn't validate length
19
- @functools.wraps(original_validation)
20
- def patched_validation(self_config):
21
- # Skip validation if short_factor length is 48
22
- if (hasattr(self_config, "rope_scaling") and
23
- "short_factor" in self_config.rope_scaling and
24
- len(self_config.rope_scaling["short_factor"]) == 48):
25
- print("Bypassing RoPE scaling validation for short_factor of length 48")
26
- return
27
- # Otherwise call the original validation
28
- return original_validation(self_config)
29
-
30
- # Apply the monkey patch
31
- Phi3Config._rope_scaling_validation = patched_validation
32
- print("Successfully patched RoPE scaling validation")
33
- except Exception as e:
34
- print(f"Warning: Could not patch RoPE scaling validation: {str(e)}")
35
 
36
  # Load tokenizer
37
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
38
 
39
- # Create text generation pipeline
40
- self.pipe = pipeline(
41
- "text-generation",
42
- model=self.model_path,
43
- tokenizer=self.tokenizer,
44
  torch_dtype=torch.float16,
45
- device_map="auto",
46
- return_full_text=False # Only return the generated text, not the prompt
47
  )
 
48
 
49
  def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
50
  """Handle inference request in OpenAI-like format"""
@@ -113,36 +100,37 @@ class EndpointHandler:
113
  return prompt
114
 
115
  def _generate(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
116
- """Generate response using the pipeline"""
117
  prompt = inputs["prompt"]
118
  params = inputs["generation_params"]
119
 
 
 
 
120
  # Count input tokens
121
- input_tokens = len(self.tokenizer.encode(prompt))
122
 
123
- # Convert OpenAI-like parameters to pipeline parameters
124
  generation_kwargs = {
125
  "max_new_tokens": params["max_tokens"],
126
  "temperature": params["temperature"],
127
  "top_p": params["top_p"],
128
  "num_return_sequences": params["n"],
129
  "do_sample": params["temperature"] > 0,
 
130
  }
131
 
132
- # Add stopping criteria if provided
133
- if params["stop"]:
134
- generation_kwargs["stopping_criteria"] = params["stop"]
135
-
136
- # Generate output using the pipeline
137
- pipeline_outputs = self.pipe(
138
- prompt,
139
- **generation_kwargs
140
- )
141
 
142
- # Extract generated texts
143
  generated_texts = []
144
- for output in pipeline_outputs:
145
- gen_text = output["generated_text"]
146
 
147
  # Apply stop sequences if provided
148
  if params["stop"]:
 
1
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import Dict, List, Any

# Work around the Phi-3 rope_scaling validation rejecting this model's
# config (the short/long factor lists fail the stock length check).
# Guard the patch so the handler still imports cleanly if the phi3
# module path changes or transformers is built without it — the removed
# version of this file wrapped the patch the same way.
try:
    import transformers.models.phi3.configuration_phi3 as _phi3_cfg

    # Keep a reference to the original validator so it can be restored
    # or called manually if ever needed.
    original_validation = _phi3_cfg.Phi3Config._rope_scaling_validation

    def no_validation(self):
        """No-op replacement: skip rope_scaling validation entirely."""
        pass

    # Apply the patch.
    _phi3_cfg.Phi3Config._rope_scaling_validation = no_validation
except Exception as e:
    # Best-effort: a failed patch should degrade to a warning, not an
    # import-time crash of the endpoint handler.
    print(f"Warning: could not patch Phi3 rope_scaling validation: {e}")
18
 
19
  class EndpointHandler:
20
  def __init__(self, path=""):
21
  # Initialize model and tokenizer
22
  self.model_path = path if path else os.environ.get("MODEL_PATH", "")
23
+ print(f"Loading model from: {self.model_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # Load tokenizer
26
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
27
 
28
+ # Load model directly without pipeline
29
+ self.model = AutoModelForCausalLM.from_pretrained(
30
+ self.model_path,
 
 
31
  torch_dtype=torch.float16,
32
+ device_map="auto"
 
33
  )
34
+ print("Model loaded successfully")
35
 
36
  def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
37
  """Handle inference request in OpenAI-like format"""
 
100
  return prompt
101
 
102
  def _generate(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
103
+ """Generate response using the model directly"""
104
  prompt = inputs["prompt"]
105
  params = inputs["generation_params"]
106
 
107
+ # Tokenize input
108
+ input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.model.device)
109
+
110
  # Count input tokens
111
+ input_tokens = input_ids.shape[1]
112
 
113
+ # Convert OpenAI-like parameters to HF parameters
114
  generation_kwargs = {
115
  "max_new_tokens": params["max_tokens"],
116
  "temperature": params["temperature"],
117
  "top_p": params["top_p"],
118
  "num_return_sequences": params["n"],
119
  "do_sample": params["temperature"] > 0,
120
+ "pad_token_id": self.tokenizer.eos_token_id,
121
  }
122
 
123
+ # Generate output
124
+ with torch.no_grad():
125
+ outputs = self.model.generate(
126
+ input_ids,
127
+ **generation_kwargs
128
+ )
 
 
 
129
 
130
+ # Decode output
131
  generated_texts = []
132
+ for i in range(params["n"]):
133
+ gen_text = self.tokenizer.decode(outputs[i][input_tokens:], skip_special_tokens=True)
134
 
135
  # Apply stop sequences if provided
136
  if params["stop"]: