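"""Custom Hugging Face Inference Endpoints handler for djangodevloper/llama3-70b-4bit-medqa.

EndpointHandler loads the tokenizer and model, selects a system prompt based on the
request's "user_type" ("general" or "professional") and "mode" ("chat", "summary",
or "header"), runs text generation, and returns {"generated_text": ...} on success
or {"error": ...} on failure.
"""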
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline


class EndpointHandler:
    def __init__(self, model_path="djangodevloper/llama3-70b-4bit-medqa"):
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                device_map="auto",
                torch_dtype=torch.bfloat16,
                trust_remote_code=True
            )
            self.pipeline = TextGenerationPipeline(
                model=self.model,
                tokenizer=self.tokenizer,
            )
        except Exception as e:
            raise RuntimeError(f"Failed to initialize model or tokenizer: {e}") from e

        # PROMPT FOR GENERAL USERS
        self.general_prompt = (
            "You are DoctusMind, a trustworthy and friendly medical AI assistant. "
            "Provide clear, easy-to-understand, and medically accurate answers to everyday health questions. "
            "Use simple language and suggest safe, evidence-informed home remedies when suitable. "
            "Be supportive and avoid technical jargon. Prioritize safety and clarity. "
            "If asked a non-medical question, politely respond with:\n"
            "`{\"not_medical_question\": true}`\n"
            "Format responses with bullet points, headers, or short paragraphs when helpful."
        )

        # PROMPT FOR PROFESSIONAL USERS
        self.professional_prompt = (
            "You are DoctusMind, a highly competent and articulate medical AI assistant for healthcare professionals. "
            "Provide concise, medically rigorous responses using appropriate clinical terminology, diagnostic language, "
            "and pathophysiological reasoning. Reference guidelines (e.g., WHO, CDC, NICE) where relevant. "
            "Always maintain a professional tone and format responses for quick clinical comprehension. "
            "If asked a non-medical question, reply with:\n"
            "`{\"not_medical_question\": true}`"
        )

        # PROMPT FOR CONVERSATION SUMMARY
        self.summary_prompt = (
            "Update the user’s running chat summary by incorporating the most recent messages. "
            "Preserve important context like health conditions, preferences, personal facts, "
            "or constraints. Keep the summary compact and in User: ...\\nBot: ... format. "
            "Omit small talk unless relevant."
        )

        # PROMPT FOR CONVERSATION HEADER
        self.header_prompt = (
            "Generate a short and meaningful header (max 50 characters) based on the conversation."
        )

    def __call__(self, data):
        try:
            user_input = data.get("inputs", "")
            user_type = data.get("user_type", "general").strip().lower()
            mode = data.get("mode", "chat").strip().lower()

            if not user_input:
                return {"error": "Missing 'inputs' in request."}

            # Pick system prompt
            if mode == "summary":
                system_prompt = self.summary_prompt
            elif mode == "header":
                system_prompt = self.header_prompt
            else:
                system_prompt = self.professional_prompt if user_type == "professional" else self.general_prompt

            # Compose a single-string prompt with system/user/assistant role markers;
            # the assistant's reply is later recovered by splitting on "<|assistant|>"
            full_prompt = f"<|system|>{system_prompt}<|user|>{user_input}<|assistant|>"

            # Generate with greedy decoding (deterministic and keeps decoding latency low).
            # Note: with do_sample=False, sampling knobs such as temperature/top_k/top_p
            # are ignored by transformers, so they are omitted here.
            outputs = self.pipeline(
                full_prompt,
                max_new_tokens=600,  # capped for latency; still enough for complete answers
                repetition_penalty=1.05,
                do_sample=False,
                eos_token_id=[
                    self.tokenizer.eos_token_id,
                    # Llama 3 ends assistant turns with <|eot_id|>
                    self.tokenizer.convert_tokens_to_ids("<|eot_id|>")
                ]
            )

            # Extract
            generated_text = outputs[0]["generated_text"]
            response = generated_text.split("<|assistant|>")[-1].strip()

            # Fallback if empty
            if not response:
                response = "Sorry, I couldn't generate a complete response. Try rephrasing."

            return {"generated_text": response}

        except Exception as e:
            return {"error": f"Inference error: {str(e)}"}