phi-detection / app.py
shree256
Refactor PHI detection logic to validate entity positions, remove overlaps, and improve redaction accuracy. Added error handling and enhanced output formatting for detected entities.
b9efb1f
import gradio as gr
from transformers import pipeline
# Build the Stanford de-identification pipeline once at import time; the
# resulting callable is shared by every call to the detection function below.
print("Loading Stanford PHI detector model...")
phi_detector = pipeline(
    task="token-classification",
    model="StanfordAIMI/stanford-deidentifier-base",
    aggregation_strategy="simple",
    device=-1,  # CPU mode
)
print("Model loaded successfully!")
def _format_no_phi_report(heading, text):
    """Build the Markdown report shown when there is nothing to redact.

    Args:
        heading: Title placed after the check mark in the report header.
        text: The unmodified input, echoed as both original and redacted text.

    Returns:
        Markdown string with the header and two fenced copies of ``text``.
    """
    return (
        f"## ✅ {heading}\n\n"
        f"**Original Text:**\n```\n{text}\n```\n\n"
        f"**Redacted Text:**\n```\n{text}\n```\n"
    )


def _collect_valid_entities(text, results):
    """Keep only detector hits whose character span can be sliced from ``text``.

    Args:
        text: The analyzed input string.
        results: Iterable of dicts from the detector; each is read via the
            keys ``start``, ``end``, ``entity_group`` and ``score``.

    Returns:
        List of dicts with keys ``start``, ``end``, ``text``, ``type`` and
        ``score``, in the order the detector produced them.
    """
    text_len = len(text)
    valid = []
    for entity in results:
        start = entity.get("start")
        end = entity.get("end")
        label = entity.get("entity_group", "UNKNOWN")

        # Offsets may be missing or explicitly None; comparing None with an
        # int would raise, so treat such hits as invalid instead of failing
        # the whole request.
        if (
            start is None
            or end is None
            or start < 0
            or end > text_len
            or start >= end
        ):
            # Log only the span and label: the raw entity carries the PHI
            # itself and must not end up in server logs.
            print(
                f"Warning: Invalid entity positions {start}-{end} "
                f"for entity type: {label}"
            )
            continue

        # Slice from the input rather than trusting the detector's own
        # surface form, so the reported text is exactly what gets replaced.
        entity_text = text[start:end]
        if not entity_text.strip():
            continue

        valid.append(
            {
                "start": start,
                "end": end,
                "text": entity_text,
                "type": label,
                "score": entity.get("score", 0.0),
            }
        )
    return valid


def _drop_overlaps(entities):
    """Return entities in ascending start order with overlapping spans removed.

    Entities are visited by start position, longest first on ties; one is
    kept only if it begins at or after the end of the last kept entity.
    Because kept spans are disjoint and sorted, that single comparison is
    equivalent to checking against every previously kept span.
    """
    kept = []
    last_end = 0
    for entity in sorted(entities, key=lambda e: (e["start"], -e["end"])):
        if entity["start"] >= last_end:
            kept.append(entity)
            last_end = entity["end"]
    return kept


def _format_redaction_report(text, entities):
    """Redact ``entities`` in ``text`` and build the Markdown result report.

    Args:
        text: The analyzed input string.
        entities: Non-overlapping entity dicts sorted by ascending ``start``.

    Returns:
        Markdown string listing each entity, then the original and redacted
        text in fenced blocks.
    """
    # Single forward pass: copy the gap before each entity, then its tag.
    # All offsets refer to the original text, so nothing shifts.
    pieces = []
    cursor = 0
    for entity in entities:
        pieces.append(text[cursor:entity["start"]])
        pieces.append(f"[{entity['type']}]")
        cursor = entity["end"]
    pieces.append(text[cursor:])
    redacted_text = "".join(pieces)

    parts = [
        "## 🔍 PHI Detection & Redaction Results\n\n",
        f"**Found {len(entities)} PHI entity(ies):**\n\n",
    ]
    parts.extend(
        f"{idx}. **{entity['text']}** → `{entity['type']}` "
        f"(Confidence: {entity['score']:.2%})\n"
        for idx, entity in enumerate(entities, 1)
    )
    parts.append("\n---\n\n")
    parts.append(f"### 📄 Original Text\n```\n{text}\n```\n\n")
    parts.append(f"### 🔒 Redacted Text\n```\n{redacted_text}\n```\n")
    return "".join(parts)


def detect_and_redact_phi(text: str) -> str:
    """Detect PHI in ``text`` and return a Markdown report with it redacted.

    Each detected span is replaced by ``[<entity type>]``. Spans that are out
    of bounds, empty, whitespace-only or lacking offsets are ignored, and
    when spans overlap only the earliest (longest on ties) is kept.

    Args:
        text: Input text to analyze.

    Returns:
        Markdown string. For empty or whitespace-only input this is a prompt
        to enter text; if detection raises, it is an error message;
        otherwise it is a report showing the original and redacted text.
    """
    if not text or not text.strip():
        return "⚠️ Please enter some text to analyze."

    try:
        # Raw detector output is deliberately not printed: it contains PHI.
        results = phi_detector(text)
        if not results:
            return _format_no_phi_report("No PHI Detected", text)

        entities = _drop_overlaps(_collect_valid_entities(text, results))
        if not entities:
            return _format_no_phi_report("No Valid PHI Detected", text)

        return _format_redaction_report(text, entities)
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing
        # the handler, and keep the full traceback in the server log.
        import traceback

        error_details = traceback.format_exc()
        print(f"Error in detect_and_redact_phi: {error_details}")
        return f"❌ **Error:** {str(e)}"
# Create the Gradio interface: one multi-line text input routed through
# detect_and_redact_phi, with the returned Markdown rendered as the output.
demo = gr.Interface(
    fn=detect_and_redact_phi,
    inputs=gr.Textbox(
        label="Enter Text to Analyze",
        placeholder="Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024.",
        lines=8,
    ),
    outputs=gr.Markdown(label="PHI Detection & Redaction Results"),
    # Title emoji restored from its mis-encoded form.
    title="🏥 Stanford PHI Detector & Redactor",
    description="Detect and redact Protected Health Information (PHI) using Stanford's de-identification model.",
    # Each inner list is one example row; the single value fills the textbox.
    examples=[
        ["Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024."],
        [
            "Jane Smith, DOB: 03/22/1980, Phone: (555) 123-4567, Address: 123 Main St, Boston, MA"
        ],
        [
            "MRN: 98765432. Dr. Anderson saw the patient at Massachusetts General Hospital on December 15, 2024."
        ],
    ],
    theme="soft",
)
if __name__ == "__main__":
    # Start the web server only when this file is executed as a script.
    launch_settings = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,  # Set to True for public link
    }
    demo.launch(**launch_settings)