Spaces:

shreepanicker
/

phi-detection

Running

phi-detection / app.py

shree256

Refactor PHI detection logic to validate entity positions, remove overlaps, and improve redaction accuracy. Added error handling and enhanced output formatting for detected entities.

b9efb1f 4 days ago

raw

history blame contribute delete

7.14 kB

	import gradio as gr
	from transformers import pipeline

	# Build the Stanford de-identification pipeline once, at import time,
	# so every request served by the app reuses the same loaded model.
	print("Loading Stanford PHI detector model...")
	phi_detector = pipeline(
	    task="token-classification",
	    model="StanfordAIMI/stanford-deidentifier-base",
	    device=-1,  # run on CPU
	    aggregation_strategy="simple",
	)
	print("Model loaded successfully!")


	def detect_and_redact_phi(text, *, detector=None):
	    """
	    Detect and redact PHI in text using Stanford's PHI detector

	    Detected spans with unusable positions are dropped, overlapping spans
	    are merged so that the union of every detection is redacted, and each
	    remaining span is replaced by a ``[TYPE]`` tag.

	    Args:
	        text: Input text to analyze
	        detector: Optional callable that takes the text and returns a list
	            of entity dicts with "start", "end", "entity_group" and "score"
	            keys. Defaults to the module-level ``phi_detector`` pipeline.

	    Returns:
	        Markdown-formatted string listing the detected entities followed by
	        the original and redacted text. A short warning string is returned
	        for empty input, and an error string if detection raises.
	    """
	    if not text or not text.strip():
	        return "⚠️ Please enter some text to analyze."

	    try:
	        if detector is None:
	            detector = phi_detector

	        # Raw predictions are intentionally not printed: they contain the
	        # very PHI this tool is supposed to remove, and stdout ends up in
	        # the server log.
	        results = detector(text)

	        if not results:
	            return _format_unchanged_report("## ✅ No PHI Detected", text)

	        entities = _merge_overlapping_entities(
	            text, _collect_valid_entities(text, results)
	        )
	        if not entities:
	            return _format_unchanged_report("## ✅ No Valid PHI Detected", text)

	        redacted_text = _apply_redactions(text, entities)

	        # Format output
	        report = [
	            "## 🔍 PHI Detection & Redaction Results\n\n",
	            f"Found {len(entities)} PHI entity(ies):\n\n",
	        ]
	        for idx, entity in enumerate(entities, 1):
	            report.append(
	                f"{idx}. {entity['text']} → `{entity['type']}` "
	                f"(Confidence: {entity['score']:.2%})\n"
	            )
	        report.append("\n---\n\n")
	        report.append("### 📄 Original Text\n```\n")
	        report.append(text)
	        report.append("\n```\n\n")
	        report.append("### 🔒 Redacted Text\n```\n")
	        report.append(redacted_text)
	        report.append("\n```\n")
	        return "".join(report)

	    except Exception as e:
	        # Top-level boundary for the UI callback: log the traceback and
	        # show the user a short message instead of crashing the request.
	        import traceback

	        error_details = traceback.format_exc()
	        print(f"Error in detect_and_redact_phi: {error_details}")
	        return f"❌ Error: {str(e)}"


	def _format_unchanged_report(heading, text):
	    """Build the report used when nothing is redacted (both sections show ``text``)."""
	    return (
	        f"{heading}\n\n"
	        "Original Text:\n```\n"
	        f"{text}"
	        "\n```\n\n"
	        "Redacted Text:\n```\n"
	        f"{text}"
	        "\n```\n"
	    )


	def _collect_valid_entities(text, results):
	    """Normalize raw detector output, keeping only spans that lie inside ``text``."""
	    text_len = len(text)
	    valid_entities = []

	    for entity in results:
	        start = entity.get("start")
	        end = entity.get("end")
	        label = entity.get("entity_group", "UNKNOWN")

	        # Positions may be missing or explicitly None (e.g. when the
	        # detector cannot provide character offsets); comparing None with
	        # an int would raise, so such entities are skipped up front.
	        if (
	            start is None
	            or end is None
	            or start < 0
	            or end > text_len
	            or start >= end
	        ):
	            # Only positions and label are logged, never the entity text.
	            print(f"Warning: Invalid entity positions {start}-{end} for {label} entity")
	            continue

	        # Slice the original text rather than trusting the detector's own
	        # "word" field, so the reported text is exactly what gets replaced.
	        entity_text = text[start:end]

	        # Skip whitespace-only spans: there is nothing to redact.
	        if not entity_text.strip():
	            continue

	        valid_entities.append(
	            {
	                "start": start,
	                "end": end,
	                "text": entity_text,
	                "type": label,
	                "score": entity.get("score", 0.0),
	            }
	        )

	    return valid_entities


	def _merge_overlapping_entities(text, entities):
	    """Return entities ordered by start, with overlapping spans fused into one."""
	    merged = []
	    # Sorting by start (longest first on ties) means an entity can only
	    # overlap the most recently kept span, so one pass is enough.
	    for entity in sorted(entities, key=lambda e: (e["start"], -e["end"])):
	        if merged and entity["start"] < merged[-1]["end"]:
	            kept = merged[-1]
	            # Extend the kept span instead of discarding the overlapping
	            # one; otherwise the tail of the later span would survive in
	            # the "redacted" output. The kept span's type and score win.
	            if entity["end"] > kept["end"]:
	                kept["end"] = entity["end"]
	                kept["text"] = text[kept["start"]:kept["end"]]
	            continue
	        merged.append(dict(entity))
	    return merged


	def _apply_redactions(text, entities):
	    """Replace each span of ``entities`` (sorted, non-overlapping) with its ``[TYPE]`` tag."""
	    pieces = []
	    cursor = 0
	    for entity in entities:
	        pieces.append(text[cursor:entity["start"]])
	        pieces.append(f"[{entity['type']}]")
	        cursor = entity["end"]
	    pieces.append(text[cursor:])
	    return "".join(pieces)


	# Wire the redaction function into a simple Gradio UI: one multi-line
	# text input, a Markdown result pane, and a few canned example inputs.
	demo = gr.Interface(
	    title="🏥 Stanford PHI Detector & Redactor",
	    description=(
	        "Detect and redact Protected Health Information (PHI) "
	        "using Stanford's de-identification model."
	    ),
	    theme="soft",
	    fn=detect_and_redact_phi,
	    inputs=gr.Textbox(
	        lines=8,
	        label="Enter Text to Analyze",
	        placeholder="Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024.",
	    ),
	    outputs=gr.Markdown(label="PHI Detection & Redaction Results"),
	    examples=[
	        ["Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024."],
	        [
	            "Jane Smith, DOB: 03/22/1980, Phone: (555) 123-4567, "
	            "Address: 123 Main St, Boston, MA"
	        ],
	        [
	            "MRN: 98765432. Dr. Anderson saw the patient at "
	            "Massachusetts General Hospital on December 15, 2024."
	        ],
	    ],
	)

	if __name__ == "__main__":
	    # Serve on every network interface at port 7860.
	    # Switch share to True to get a public link.
	    demo.launch(share=False, server_port=7860, server_name="0.0.0.0")