Brain-LLM
/

phi4-mini-raw

Safetensors

phi3

custom_code

Model card Files Files and versions

xet

Community

yol146 committed on Apr 12, 2025

Commit

fd19926

1 Parent(s): 290cf25

modify the handler

Browse files

Files changed (1) hide show

handler.py +83 -10

handler.py CHANGED Viewed

@@ -104,23 +104,96 @@ class EndpointHandler:
             do_sample = parameters.get("do_sample", self.do_sample)
             stream = parameters.get("stream", False)
-            # Tokenize the input safely
-            inputs = self.tokenizer(prompt, return_tensors="pt")
-            logger.info(f"Input tokens shape: {inputs['input_ids'].shape}")
-            # Move to device
-            inputs = {k: v.to(self.device) for k, v in inputs.items()}
-            # Handle streaming if requested
-            if stream:
-                return self._generate_stream(inputs, max_new_tokens, temperature, top_p, do_sample)
-            else:
-                return self._generate(inputs, max_new_tokens, temperature, top_p, do_sample)
         except Exception as e:
             logger.error(f"Error during generation: {e}")
             return {"error": str(e)}
     def _generate(self, inputs, max_new_tokens, temperature, top_p, do_sample):
         """Generate text non-streaming mode"""
         try:

             do_sample = parameters.get("do_sample", self.do_sample)
             stream = parameters.get("stream", False)
+            # CRITICAL FIX: Use manual generation approach for Phi models with vocabulary mismatches
+            # This bypasses the token indexing issues
+            if stream:
+                return {"error": "Streaming temporarily disabled while fixing token indexing issues"}
+            # Manually implement generation to avoid token index errors
+            input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
+            logger.info(f"Input tokens shape: {input_ids.shape}")
+            # Create attention mask
+            attention_mask = torch.ones_like(input_ids)
+            # Perform safe generation with error handling for out-of-vocabulary issues
+            return self._safe_generate(input_ids, attention_mask, max_new_tokens, temperature, top_p, do_sample)
         except Exception as e:
             logger.error(f"Error during generation: {e}")
             return {"error": str(e)}
+    def _safe_generate(self, input_ids, attention_mask, max_new_tokens, temperature, top_p, do_sample):
+        """Safely generate text handling potential token index errors"""
+        try:
+            with torch.no_grad():
+                # Get the input text to exclude from final output
+                input_text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
+                logger.info(f"Input decoded text: '{input_text}'")
+                # Generate one token at a time to avoid index errors
+                max_steps = min(max_new_tokens, 100)  # Limit to 100 tokens for testing
+                current_ids = input_ids.clone()
+                for _ in range(max_steps):
+                    # Get logits for next token
+                    outputs = self.model(
+                        input_ids=current_ids,
+                        attention_mask=attention_mask,
+                        return_dict=True
+                    )
+                    next_token_logits = outputs.logits[:, -1, :]
+                    # Apply temperature and sampling
+                    if temperature > 0:
+                        next_token_logits = next_token_logits / temperature
+                    if do_sample:
+                        # Apply top_p sampling
+                        sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
+                        cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+                        # Remove tokens with cumulative probability above the threshold
+                        sorted_indices_to_remove = cumulative_probs > top_p
+                        # Shift the indices to the right to keep also the first token above the threshold
+                        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+                        sorted_indices_to_remove[..., 0] = 0
+                        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+                        next_token_logits[indices_to_remove] = -float('Inf')
+                        # Sample from the filtered distribution
+                        probs = torch.softmax(next_token_logits, dim=-1)
+                        next_token = torch.multinomial(probs, num_samples=1)
+                    else:
+                        # Take the token with highest probability
+                        next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
+                    # Add the predicted token to the sequence
+                    current_ids = torch.cat([current_ids, next_token], dim=-1)
+                    attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)
+                    # Check if we've generated an EOS token
+                    if next_token[0, 0].item() == self.tokenizer.eos_token_id:
+                        break
+                # Decode the generated sequence
+                generated_text = self.tokenizer.decode(current_ids[0], skip_special_tokens=True)
+                # Return only the newly generated text (without the prompt)
+                if generated_text.startswith(input_text):
+                    response_text = generated_text[len(input_text):]
+                else:
+                    response_text = generated_text
+                logger.info(f"Generated {len(response_text)} characters")
+                return {"generated_text": response_text}
+        except Exception as e:
+            logger.error(f"Error in _safe_generate: {str(e)}")
+            return {"error": f"Generation error: {str(e)}. Please try a simpler input."}
     def _generate(self, inputs, max_new_tokens, temperature, top_p, do_sample):
         """Generate text non-streaming mode"""
         try: