# QwenStem-7b / handler.py — uploaded by analist (commit 9853ed9, verified)
"""
Custom Handler for QwenStem-7b on Hugging Face Endpoints
Handles both text and multimodal (text+image) inputs
"""
import torch
import base64
import logging
from io import BytesIO
from typing import Dict, List, Any, Optional
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
# Logging configuration: INFO-level root config plus a module-level logger
# used throughout the handler.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class EndpointHandler:
    """Custom inference handler for QwenStem-7b on Hugging Face Endpoints.

    Accepts either a plain-text prompt or a dict with "text" and an optional
    base64-encoded "image", and returns the generated text.
    """

    def __init__(self, path=""):
        """Load the processor and model for HF Endpoints.

        Args:
            path: Path to the model directory (provided by HF Endpoints).
                  Falls back to the hub id "analist/QwenStem-7b" when empty.

        Raises:
            Exception: re-raised unchanged if the processor or model fails
                to load (HF Endpoints treats this as a failed startup).
        """
        logger.info(f"Initializing model from path: {path}")
        # Pick the best available device once; reused by every request.
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
            logger.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
        else:
            self.device = torch.device("cpu")
            logger.info("Using CPU")
        # Single source of truth for the model id (was duplicated per call).
        model_id = path if path else "analist/QwenStem-7b"
        try:
            logger.info("Loading processor...")
            self.processor = AutoProcessor.from_pretrained(
                model_id,
                trust_remote_code=True
            )
            # Load WITHOUT quantization: the endpoint infrastructure applies
            # quantization itself when needed. fp16 only makes sense on GPU.
            logger.info("Loading model...")
            self.model = AutoModelForVision2Seq.from_pretrained(
                model_id,
                trust_remote_code=True,
                torch_dtype=torch.float16 if self.device.type == "cuda" else torch.float32,
                low_cpu_mem_usage=True
            ).to(self.device)
            # Inference only: disables dropout and other training-time layers.
            self.model.eval()
            logger.info("Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            raise
        # Per-request "parameters" are merged on top of these defaults.
        # NOTE(review): 9192 * 10 (= 91920) new tokens is extremely large and
        # looks like a typo for 8192 — kept as-is to preserve behavior; confirm.
        self.default_generation_config = {
            "max_new_tokens": 9192 * 10,
            "temperature": 0.7,
            "top_p": 0.9,
            "do_sample": True,
            "repetition_penalty": 1.05
        }

    def _generation_kwargs(self, config: dict) -> Dict[str, Any]:
        """Translate a merged request config into model.generate() kwargs.

        Centralizes the generation defaults that were previously duplicated
        in the text-only and multimodal paths.
        """
        eos_id = self.processor.tokenizer.eos_token_id
        return {
            "max_new_tokens": config.get("max_new_tokens", 9192 * 10),
            "temperature": config.get("temperature", 0.7),
            "top_p": config.get("top_p", 0.9),
            "do_sample": config.get("do_sample", True),
            "repetition_penalty": config.get("repetition_penalty", 1.05),
            # Use EOS as padding too: the tokenizer may define no pad token.
            "pad_token_id": eos_id,
            "eos_token_id": eos_id,
        }

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Process one incoming request for HF Endpoints.

        Args:
            data: Dictionary containing:
                - inputs: text prompt (str), or dict with "text" and an
                  optional base64 "image"
                - parameters: optional generation-parameter overrides (dict)

        Returns:
            [{"generated_text": ...}] on success, or
            [{"error": ..., "error_type": ...}] on failure (never raises).
        """
        try:
            inputs = data.get("inputs", "")
            parameters = data.get("parameters", {})
            logger.info(f"Processing request - Input type: {type(inputs)}")
            # Request parameters override the handler-wide defaults.
            gen_config = {**self.default_generation_config, **parameters}
            if isinstance(inputs, dict):
                # Structured input: text plus optional image.
                text = inputs.get("text", "")
                image_data = inputs.get("image", None)
                if image_data:
                    logger.info("Processing multimodal input (text + image)")
                    response = self._process_multimodal(text, image_data, gen_config)
                else:
                    logger.info("Processing text-only input from dict")
                    response = self._process_text(text, gen_config)
            elif isinstance(inputs, str):
                logger.info("Processing text-only input")
                response = self._process_text(inputs, gen_config)
            else:
                raise ValueError(f"Unsupported input type: {type(inputs)}")
            return [{"generated_text": response}]
        except Exception as e:
            # Surface the failure to the client instead of an opaque 500.
            logger.error(f"Error during inference: {str(e)}")
            return [{"error": str(e), "error_type": type(e).__name__}]

    def _process_text(self, text: str, config: dict) -> str:
        """Generate a completion for a text-only prompt.

        Args:
            text: user prompt; must be non-empty.
            config: merged generation config (defaults + request overrides).

        Raises:
            ValueError: if the prompt is empty.
        """
        if not text:
            raise ValueError("Empty text input")
        messages = [
            {"role": "user", "content": [
                {"type": "text", "text": text}
            ]}
        ]
        # apply_chat_template with tokenize=True returns input ids ready for
        # generate(); presumably shape (1, seq) — TODO confirm per processor.
        text_inputs = self.processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(
                text_inputs,
                **self._generation_kwargs(config)
            )
        full_response = self.processor.decode(outputs[0], skip_special_tokens=True)
        # Prefer splitting on the chat template's "assistant" marker; otherwise
        # strip the decoded prompt prefix from the full output.
        if "assistant" in full_response:
            response = full_response.split("assistant")[-1].strip()
        else:
            prompt_text = self.processor.decode(text_inputs[0], skip_special_tokens=True)
            response = full_response[len(prompt_text):].strip()
        return response

    def _process_multimodal(self, text: str, image_b64: str, config: dict) -> str:
        """Generate a completion for a text + base64-image prompt.

        Args:
            text: optional user prompt (a default prompt is substituted in the
                chat message when empty).
            image_b64: base64-encoded image, with or without a data-URL header.
            config: merged generation config (defaults + request overrides).

        Raises:
            ValueError: if the image cannot be decoded.
        """
        try:
            if image_b64.startswith('data:image'):
                # Strip the "data:image/...;base64," header. maxsplit=1 keeps
                # any later commas in the payload and avoids IndexError-style
                # surprises on malformed headers.
                image_b64 = image_b64.split(',', 1)[1]
            image_bytes = base64.b64decode(image_b64)
            image = Image.open(BytesIO(image_bytes)).convert("RGB")
            logger.info(f"Image loaded: {image.size}")
        except Exception as e:
            logger.error(f"Image decode error: {str(e)}")
            raise ValueError(f"Failed to decode image: {str(e)}")
        messages = [
            {"role": "user", "content": [
                {"type": "text", "text": text if text else "Analyse cette image."},
                {"type": "image"}
            ]}
        ]
        # Build the text prompt first, then tokenize together with the image.
        prompt = self.processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False
        )
        inputs = self.processor(
            text=prompt,
            images=[image],
            return_tensors="pt"
        )
        # Move every tensor to the model's device; leave non-tensors untouched.
        inputs = {k: v.to(self.device) if hasattr(v, 'to') else v
                  for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                **self._generation_kwargs(config)
            )
        full_response = self.processor.decode(outputs[0], skip_special_tokens=True)
        # Prefer the chat template's "assistant" marker. The prompt-stripping
        # fallback must not call str.split("") (raises ValueError), so an
        # empty prompt falls through to the raw decoded output.
        if "assistant" in full_response:
            response = full_response.split("assistant")[-1].strip()
        elif text and text in full_response:
            response = full_response.split(text)[-1].strip()
        else:
            response = full_response
        return response

    def health(self) -> Dict[str, Any]:
        """Health-check endpoint for monitoring.

        Returns:
            Dict with overall "status" ("healthy" or "degraded"), model and
            system info, GPU memory stats when running on CUDA, and the
            result of a one-token smoke-test generation.
        """
        health_status = {
            "status": "healthy",
            "model": {
                "name": "QwenStem-7b",
                "type": "Vision-Language Model",
                "loaded": hasattr(self, 'model') and self.model is not None,
                "device": str(self.device) if hasattr(self, 'device') else "unknown"
            },
            "system": {
                "torch_version": torch.__version__,
                "cuda_available": torch.cuda.is_available(),
                "gpu_count": torch.cuda.device_count() if torch.cuda.is_available() else 0
            }
        }
        # GPU memory statistics, only when the model actually runs on CUDA.
        if torch.cuda.is_available() and hasattr(self, 'device') and self.device.type == 'cuda':
            try:
                gpu_props = torch.cuda.get_device_properties(0)
                health_status["gpu"] = {
                    "name": gpu_props.name,
                    "memory_total_gb": round(gpu_props.total_memory / (1024**3), 2),
                    "memory_allocated_gb": round(torch.cuda.memory_allocated() / (1024**3), 2),
                    "memory_reserved_gb": round(torch.cuda.memory_reserved() / (1024**3), 2),
                    "utilization_percent": round(torch.cuda.memory_allocated() / gpu_props.total_memory * 100, 2)
                }
            except Exception as e:
                # GPU stats are best-effort; failing here does not degrade status.
                logger.warning(f"Could not get GPU stats: {e}")
                health_status["gpu"] = {"error": str(e)}
        # One-token smoke test to verify that the model still responds.
        if hasattr(self, 'model') and self.model is not None:
            try:
                with torch.no_grad():
                    test_input = self.processor.apply_chat_template(
                        [{"role": "user", "content": [{"type": "text", "text": "test"}]}],
                        tokenize=True,
                        add_generation_prompt=True,
                        return_tensors="pt"
                    ).to(self.device)
                    # Greedy, single-token generation: cheapest possible probe.
                    _ = self.model.generate(
                        test_input,
                        max_new_tokens=1,
                        do_sample=False
                    )
                health_status["model"]["responsive"] = True
            except Exception as e:
                logger.error(f"Model test failed: {e}")
                health_status["model"]["responsive"] = False
                health_status["model"]["error"] = str(e)
                health_status["status"] = "degraded"
        return health_status