import os
import re
import requests
import logging
from typing import Tuple, List, Dict
logger = logging.getLogger(__name__)
class SafetyGuard:
"""
Wrapper around NVIDIA Llama Guard (meta/llama-guard-4-12b) hosted at
https://integrate.api.nvidia.com/v1/chat/completions
Exposes helpers to validate:
- user input safety
- model output safety (in context of the user question)
"""
def __init__(self):
        # NOTE: the NVIDIA API key is read from the NVIDIA_URI environment variable.
        self.api_key = os.getenv("NVIDIA_URI")
        if not self.api_key:
            raise ValueError("NVIDIA_URI environment variable not set for SafetyGuard")
self.base_url = "https://integrate.api.nvidia.com/v1/chat/completions"
self.model = "meta/llama-guard-4-12b"
self.timeout_s = 30
@staticmethod
def _chunk_text(text: str, chunk_size: int = 2800, overlap: int = 200) -> List[str]:
"""Chunk long text to keep request payloads small enough for the guard.
Uses character-based approximation with small overlap.
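        Note that overlap must stay smaller than chunk_size so the loop advances.

        Example (pure character arithmetic, no API involved):

            >>> SafetyGuard._chunk_text("a" * 10, chunk_size=4, overlap=1)
            ['aaaa', 'aaaa', 'aaaa']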
"""
if not text:
return [""]
n = len(text)
if n <= chunk_size:
return [text]
chunks: List[str] = []
start = 0
while start < n:
end = min(start + chunk_size, n)
chunks.append(text[start:end])
if end == n:
break
start = max(0, end - overlap)
return chunks
    def _call_guard(self, messages: List[Dict], max_tokens: int = 512) -> str:
        """Call the guard endpoint and return its raw text reply ('' on failure)."""
        # Enhance messages with medical context if detected
        enhanced_messages = self._enhance_messages_with_context(messages)
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
"Accept": "application/json",
}
# Try OpenAI-compatible schema first
payload_chat = {
"model": self.model,
"messages": enhanced_messages,
"temperature": 0.2,
"top_p": 0.7,
"max_tokens": max_tokens,
"stream": False,
}
# Alternative schema (some NVIDIA deployments require message content objects)
alt_messages = []
for m in enhanced_messages:
content = m.get("content", "")
if isinstance(content, str):
content = [{"type": "text", "text": content}]
alt_messages.append({"role": m.get("role", "user"), "content": content})
payload_alt = {
"model": self.model,
"messages": alt_messages,
"temperature": 0.2,
"top_p": 0.7,
"max_tokens": max_tokens,
"stream": False,
}
# Attempt primary, then fallback
for payload in (payload_chat, payload_alt):
try:
resp = requests.post(self.base_url, headers=headers, json=payload, timeout=self.timeout_s)
if resp.status_code >= 400:
# Log server message for debugging payload issues
try:
logger.error(f"[SafetyGuard] HTTP {resp.status_code}: {resp.text[:400]}")
except Exception:
pass
resp.raise_for_status()
data = resp.json()
content = (
data.get("choices", [{}])[0]
.get("message", {})
.get("content", "")
.strip()
)
if content:
return content
except Exception as e:
# Try next payload shape
logger.error(f"[SafetyGuard] Guard API call failed: {e}")
continue
# All attempts failed
return ""
@staticmethod
def _parse_guard_reply(text: str) -> Tuple[bool, str]:
"""Parse guard reply; expect 'SAFE' or 'UNSAFE: <reason>' (case-insensitive)."""
if not text:
# Fail-open: treat as SAFE if guard unavailable to avoid false blocks
return True, "guard_unavailable"
t = text.strip()
upper = t.upper()
if upper.startswith("SAFE") and not upper.startswith("SAFEGUARD"):
return True, ""
if upper.startswith("UNSAFE"):
# Extract reason after the first colon if present
parts = t.split(":", 1)
reason = parts[1].strip() if len(parts) > 1 else "policy violation"
return False, reason
        # Fallback: a non-empty but unrecognized reply is treated as unsafe
        # (fail-closed), in contrast to the empty-reply case above (fail-open)
        return False, t[:180]
    def _is_medical_query(self, query: str) -> bool:
        """Heuristically check whether a query is medical, via keyword and regex patterns."""
if not query:
return False
query_lower = query.lower()
# Medical keyword categories
medical_categories = {
'symptoms': [
'symptom', 'pain', 'ache', 'hurt', 'sore', 'tender', 'stiff', 'numb',
'headache', 'migraine', 'fever', 'cough', 'cold', 'flu', 'sneeze',
'nausea', 'vomit', 'diarrhea', 'constipation', 'bloating', 'gas',
'dizziness', 'vertigo', 'fatigue', 'weakness', 'tired', 'exhausted',
'shortness of breath', 'wheezing', 'chest pain', 'heart palpitations',
'joint pain', 'muscle pain', 'back pain', 'neck pain', 'stomach pain',
'abdominal pain', 'pelvic pain', 'menstrual pain', 'cramps'
],
'conditions': [
'disease', 'condition', 'disorder', 'syndrome', 'illness', 'sickness',
'infection', 'inflammation', 'allergy', 'asthma', 'diabetes', 'hypertension',
'depression', 'anxiety', 'stress', 'panic', 'phobia', 'ocd', 'ptsd',
'adhd', 'autism', 'dementia', 'alzheimer', 'parkinson', 'epilepsy',
'cancer', 'tumor', 'cancerous', 'malignant', 'benign', 'metastasis',
'heart disease', 'stroke', 'heart attack', 'coronary', 'arrhythmia',
'pneumonia', 'bronchitis', 'copd', 'emphysema', 'tuberculosis',
'migraine', 'headache', 'chronic migraine', 'cluster headache',
'tension headache', 'sinus headache', 'cure', 'treat', 'treatment'
],
'treatments': [
'treatment', 'therapy', 'medication', 'medicine', 'drug', 'pill', 'tablet',
'injection', 'vaccine', 'immunization', 'surgery', 'operation', 'procedure',
'chemotherapy', 'radiation', 'physical therapy', 'occupational therapy',
'psychotherapy', 'counseling', 'rehabilitation', 'recovery', 'healing',
'prescription', 'dosage', 'side effects', 'contraindications'
],
'body_parts': [
'head', 'brain', 'eye', 'ear', 'nose', 'mouth', 'throat', 'neck',
'chest', 'heart', 'lung', 'liver', 'kidney', 'stomach', 'intestine',
'back', 'spine', 'joint', 'muscle', 'bone', 'skin', 'hair', 'nail',
'arm', 'leg', 'hand', 'foot', 'finger', 'toe', 'pelvis', 'genital'
],
'medical_context': [
'doctor', 'physician', 'nurse', 'specialist', 'surgeon', 'dentist',
'medical', 'health', 'healthcare', 'hospital', 'clinic', 'emergency',
'ambulance', 'paramedic', 'pharmacy', 'pharmacist', 'lab', 'test',
'diagnosis', 'prognosis', 'examination', 'checkup', 'screening',
'patient', 'case', 'history', 'medical history', 'family history'
],
'life_stages': [
'pregnancy', 'pregnant', 'baby', 'infant', 'newborn', 'child', 'pediatric',
'teenager', 'adolescent', 'adult', 'elderly', 'senior', 'geriatric',
'menopause', 'puberty', 'aging', 'birth', 'delivery', 'miscarriage'
],
'vital_signs': [
'blood pressure', 'heart rate', 'pulse', 'temperature', 'fever',
'respiratory rate', 'oxygen saturation', 'weight', 'height', 'bmi',
'blood sugar', 'glucose', 'cholesterol', 'hemoglobin', 'white blood cell'
]
}
        # Check for medical keywords (naive substring matching, so very short
        # keywords such as 'gas' or 'ear' can over-match inside longer words)
        for keywords in medical_categories.values():
            if any(keyword in query_lower for keyword in keywords):
                return True
# Check for medical question patterns
medical_patterns = [
r'\b(what|how|why|when|where)\s+(causes?|treats?|prevents?|symptoms?|signs?)\b',
r'\b(is|are)\s+(.*?)\s+(dangerous|serious|harmful|safe|normal)\b',
r'\b(should|can|may|might)\s+(i|you|we)\s+(take|use|do|avoid)\b',
r'\b(diagnosis|diagnosed|symptoms|treatment|medicine|drug)\b',
r'\b(medical|health|doctor|physician|hospital|clinic)\b',
r'\b(pain|hurt|ache|sore|fever|cough|headache)\b',
r'\b(which\s+medication|best\s+medication|how\s+to\s+cure|without\s+medications)\b',
r'\b(chronic\s+migraine|migraine\s+treatment|migraine\s+cure)\b',
r'\b(cure|treat|heal|relief|remedy|solution)\b'
]
for pattern in medical_patterns:
if re.search(pattern, query_lower):
return True
return False
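    # Illustrative behavior (hypothetical inputs):
    #   "How do I treat a tension headache?" -> True  ('headache' keyword hit)
    #   "What's the capital of France?"      -> False (no keyword or pattern hit)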
    def check_user_query(self, user_query: str) -> Tuple[bool, str]:
        """Validate that the user query is safe to process, with medical context awareness."""
text = user_query or ""
# For medical queries, be more permissive
if self._is_medical_query(text):
logger.info("[SafetyGuard] Medical query detected, skipping strict validation")
return True, "medical_query"
# If too long, validate each chunk; any UNSAFE makes overall UNSAFE
for part in self._chunk_text(text):
messages = [{"role": "user", "content": part}]
reply = self._call_guard(messages, max_tokens=64)
ok, reason = self._parse_guard_reply(reply)
if not ok:
return False, reason
return True, ""
    def _detect_harmful_content(self, text: str) -> Tuple[bool, str]:
        """Detect harmful content via regex patterns, relaxed for medical text."""
if not text:
return True, ""
text_lower = text.lower()
# First check if this is clearly medical content - be more permissive
if self._is_medical_query(text):
# For medical content, only check for truly dangerous patterns
dangerous_medical_patterns = {
'suicide_self_harm': [
r'\b(kill\s+yourself|suicide|end\s+your\s+life|take\s+your\s+life)\b',
r'\b(self\s*harm|self\s*injury|cut\s+yourself|hurt\s+yourself)\b',
r'\b(overdose|poison\s+yourself|hang\s+yourself)\b'
],
'dangerous_medical_advice': [
r'\b(overdose\s+on|take\s+too\s+much|excessive\s+dosage)\b',
r'\b(mix\s+drugs|combine\s+medications|contraindicated)\b',
r'\b(stop\s+taking\s+prescribed|discontinue\s+medication)\b',
r'\b(don\'t\s+call\s+911|avoid\s+emergency\s+room|ignore\s+severe\s+symptoms)\b'
],
'illegal_drugs': [
r'\b(cocaine|heroin|methamphetamine|ecstasy|lsd|marijuana\s+abuse)\b',
r'\b(prescription\s+abuse|drug\s+dealing|illegal\s+substances)\b'
]
}
# Check only dangerous medical patterns
for category, patterns in dangerous_medical_patterns.items():
for pattern in patterns:
if re.search(pattern, text_lower):
return False, f"contains {category}: {pattern}"
# Allow medical content through
return True, "medical_content"
# For non-medical content, use stricter patterns
critical_patterns = {
'suicide_self_harm': [
r'\b(kill\s+yourself|suicide|end\s+your\s+life|take\s+your\s+life)\b',
r'\b(self\s*harm|self\s*injury|cut\s+yourself|hurt\s+yourself)\b',
r'\b(overdose|poison\s+yourself|hang\s+yourself)\b'
],
'violence': [
r'\b(kill\s+someone|murder|assassinate|violence|harm\s+others)\b',
r'\b(bomb|explosive|weapon|gun|knife)\b',
r'\b(attack\s+(someone|people|others|innocent))\b' # More specific attack pattern
],
'illegal_drugs': [
r'\b(cocaine|heroin|methamphetamine|ecstasy|lsd|marijuana\s+abuse)\b',
r'\b(prescription\s+abuse|drug\s+dealing|illegal\s+substances)\b'
],
'dangerous_medical': [
r'\b(overdose\s+on|take\s+too\s+much|excessive\s+dosage)\b',
r'\b(mix\s+drugs|combine\s+medications|contraindicated)\b',
r'\b(stop\s+taking\s+prescribed|discontinue\s+medication)\b'
]
}
# Check critical patterns
for category, patterns in critical_patterns.items():
for pattern in patterns:
if re.search(pattern, text_lower):
return False, f"contains {category}: {pattern}"
# Context-aware medical safety checks
medical_safety_patterns = {
'dosage_warnings': [
r'\b(take\s+more\s+than\s+prescribed|exceed\s+recommended\s+dose)\b',
r'\b(double\s+up\s+on\s+medication|take\s+extra\s+pills)\b'
],
'emergency_advice': [
r'\b(don\'t\s+call\s+911|avoid\s+emergency\s+room|ignore\s+severe\s+symptoms)\b',
r'\b(self\s*treat\s+emergency|handle\s+at\s+home\s+when\s+critical)\b'
]
}
# Check medical safety patterns
for category, patterns in medical_safety_patterns.items():
for pattern in patterns:
if re.search(pattern, text_lower):
return False, f"contains dangerous medical advice: {pattern}"
return True, ""
def _enhance_messages_with_context(self, messages: List[Dict]) -> List[Dict]:
"""Enhance messages with medical context awareness for better guard performance."""
if not messages:
return messages
# Check if any message contains medical content
has_medical_content = any(
self._is_medical_query(msg.get("content", "")) for msg in messages
)
if has_medical_content:
# Add medical context to the system prompt
enhanced_messages = []
for i, msg in enumerate(messages):
if i == 0 and msg.get("role") == "user":
# Add medical context to first user message
original_content = msg.get("content", "")
enhanced_content = f"""MEDICAL CONTEXT: This is a medical consultation. The user is asking about health-related topics.
ORIGINAL MESSAGE: {original_content}
Please evaluate this message for safety, keeping in mind that:
- Medical questions and health concerns are legitimate
- General medical information sharing is acceptable
- Only block content that is clearly harmful, dangerous, or illegal
- Medical advice should be educational, not prescriptive"""
enhanced_messages.append({
"role": msg["role"],
"content": enhanced_content
})
else:
enhanced_messages.append(msg)
return enhanced_messages
return messages
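    # Sketch of the transformation: a lone medical user message such as
    # "What causes migraines?" is rewrapped so its content carries the
    # MEDICAL CONTEXT preamble plus "ORIGINAL MESSAGE: What causes migraines?";
    # batches without medical content pass through unchanged.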
def _assess_risk_level(self, text: str) -> Tuple[str, float]:
"""Assess the risk level of content using multiple indicators."""
if not text:
return "low", 0.0
text_lower = text.lower()
# If this is medical content, be more lenient
if self._is_medical_query(text):
# For medical content, only flag truly dangerous patterns
dangerous_medical_indicators = {
'high': [
'suicide', 'kill yourself', 'end your life', 'self harm',
'overdose', 'poison yourself', 'illegal drugs', 'violence'
],
'medium': [
'prescription abuse', 'excessive dosage', 'mix drugs',
'stop taking prescribed', 'ignore severe symptoms'
]
}
risk_score = 0.0
for level, indicators in dangerous_medical_indicators.items():
for indicator in indicators:
if indicator in text_lower:
if level == 'high':
risk_score += 3.0
elif level == 'medium':
risk_score += 1.5
# Normalize score for medical content (more lenient)
risk_score = min(risk_score / 15.0, 1.0)
if risk_score >= 0.6:
return "high", risk_score
elif risk_score >= 0.2:
return "medium", risk_score
else:
return "low", risk_score
# For non-medical content, use original risk assessment
risk_indicators = {
'high': [
'suicide', 'kill yourself', 'end your life', 'self harm',
'overdose', 'poison', 'illegal drugs', 'violence', 'harm others'
],
'medium': [
'prescription abuse', 'excessive dosage', 'mix drugs',
'stop medication', 'ignore symptoms', 'avoid doctor'
],
'low': [
'pain', 'headache', 'fever', 'cough', 'treatment',
'medicine', 'doctor', 'hospital', 'symptoms'
]
}
risk_score = 0.0
for level, indicators in risk_indicators.items():
for indicator in indicators:
if indicator in text_lower:
if level == 'high':
risk_score += 3.0
elif level == 'medium':
risk_score += 1.5
else:
risk_score += 0.5
# Normalize score
risk_score = min(risk_score / 10.0, 1.0)
if risk_score >= 0.7:
return "high", risk_score
elif risk_score >= 0.3:
return "medium", risk_score
else:
return "low", risk_score
    def check_model_answer(self, user_query: str, model_answer: str) -> Tuple[bool, str]:
        """Validate that the model's answer is safe, with medical context awareness.

        Order of checks: local harmful-content patterns first, then strict
        guard validation for high-risk text, relaxed pass-through for medical
        content, guard validation for medium-risk text, and allow for low risk.
        """
uq = user_query or ""
ans = model_answer or ""
# Assess risk level first
risk_level, risk_score = self._assess_risk_level(ans)
logger.info(f"[SafetyGuard] Risk assessment: {risk_level} (score: {risk_score:.2f})")
# Always check for harmful content first
is_safe, reason = self._detect_harmful_content(ans)
if not is_safe:
return False, reason
# For high-risk content, always use strict validation
if risk_level == "high":
logger.warning("[SafetyGuard] High-risk content detected, using strict validation")
user_parts = self._chunk_text(uq, chunk_size=2000)
user_context = user_parts[0] if user_parts else ""
for ans_part in self._chunk_text(ans):
messages = [
{"role": "user", "content": user_context},
{"role": "assistant", "content": ans_part},
]
reply = self._call_guard(messages, max_tokens=96)
ok, reason = self._parse_guard_reply(reply)
if not ok:
return False, reason
return True, "high_risk_validated"
# For medical queries and answers, use relaxed validation
if self._is_medical_query(uq) or self._is_medical_query(ans):
logger.info("[SafetyGuard] Medical content detected, using relaxed validation")
return True, "medical_content"
# For medium-risk non-medical content, use guard validation
if risk_level == "medium":
logger.info("[SafetyGuard] Medium-risk content detected, using guard validation")
user_parts = self._chunk_text(uq, chunk_size=2000)
user_context = user_parts[0] if user_parts else ""
for ans_part in self._chunk_text(ans):
messages = [
{"role": "user", "content": user_context},
{"role": "assistant", "content": ans_part},
]
reply = self._call_guard(messages, max_tokens=96)
ok, reason = self._parse_guard_reply(reply)
if not ok:
return False, reason
return True, "medium_risk_validated"
# For low-risk content, allow through
logger.info("[SafetyGuard] Low-risk content detected, allowing through")
return True, "low_risk"
# Global instance (optional convenience). Note: constructing it at import time
# requires NVIDIA_URI to be set; otherwise importing this module raises ValueError.
safety_guard = SafetyGuard()
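if __name__ == "__main__":
    # Minimal smoke test (NVIDIA_URI must be set for the module-level instance
    # above). This medical demo resolves locally, without a guard API call;
    # non-medical inputs would trigger live requests.
    demo_query = "How do I treat a tension headache?"
    ok, reason = safety_guard.check_user_query(demo_query)
    print(f"query  ok={ok} reason={reason!r}")
    demo_answer = (
        "Rest, hydration, and over-the-counter analgesics may help; "
        "see a doctor if the pain persists or worsens."
    )
    ok, reason = safety_guard.check_model_answer(demo_query, demo_answer)
    print(f"answer ok={ok} reason={reason!r}")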