Spaces:

Negative-Star-Innovators
/

PII-Redaction-Playground

Runtime error

App Files Files Community

PII-Redaction-Playground / app.py

Negative-Star-Innovators

Update app.py

12737d7 verified 16 days ago

raw

history blame contribute delete

3.11 kB

	import gradio as gr
	from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

	# 1. Load the fine-tuned PII detector from the Hugging Face Hub.
	# MODEL_ID names the Hub repository used for both the tokenizer and the weights.
	MODEL_ID = "Negative-Star-Innovators/MiniLM-L6-finetuned-pii-detection-v2"

	print("Loading model...")
	tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
	model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

	# Build the token-classification pipeline once at import time so every
	# request reuses the same loaded model. The "simple" aggregation strategy is
	# what the redaction step relies on for its `entity_group`/`start`/`end` keys
	# (see the transformers pipeline documentation).
	pii_pipeline = pipeline(
	    task="token-classification",
	    tokenizer=tokenizer,
	    model=model,
	    aggregation_strategy="simple",
	)

	# 2. Define the redaction function
	def redact_pii(text: str) -> str:
	    """Replace every PII span the model detects in *text* with a tag.

	    Each detected span is swapped for ``[REDACTED <LABEL>]``, where
	    ``<LABEL>`` is the upper-cased ``entity_group`` reported by
	    ``pii_pipeline`` for that span.

	    Args:
	        text: Raw user input. ``None`` is tolerated and treated as empty.

	    Returns:
	        ``""`` for missing or whitespace-only input, the unchanged text when
	        the pipeline reports no entities, otherwise the redacted text.
	    """
	    # Guard against None as well as blank strings: calling .strip() on None
	    # would raise AttributeError before the model is ever consulted.
	    if text is None or not text.strip():
	        return ""

	    entities = pii_pipeline(text)

	    # Nothing detected: hand the input back untouched.
	    if not entities:
	        return text

	    return _apply_redactions(text, entities)


	def _apply_redactions(text, entities):
	    """Splice a ``[REDACTED <LABEL>]`` tag over each entity span in *text*."""
	    pieces = []
	    cursor = 0  # index in the ORIGINAL text up to which output is emitted

	    # Walking left to right and copying from the original string means the
	    # start/end offsets never go stale, and the result is joined once
	    # instead of rebuilding the whole string per entity.
	    for entity in sorted(entities, key=lambda item: item["start"]):
	        start = entity["start"]
	        end = entity["end"]

	        if start < cursor:
	            # Span overlaps one that is already redacted: let the previous
	            # tag cover it rather than emitting garbled or leaked text.
	            cursor = max(cursor, end)
	            continue

	        label = entity["entity_group"]
	        pieces.append(text[cursor:start])
	        pieces.append(f"[REDACTED {label.upper()}]")
	        cursor = end

	    # Whatever follows the last entity is copied through verbatim.
	    pieces.append(text[cursor:])
	    return "".join(pieces)

	# 3. Build the Gradio User Interface
	# We define the input box, the output box, and some default examples.
	# Single-function Gradio UI: one multi-line textbox in, one out, wired to
	# redact_pii, with a few canned examples users can click to try the model.
	demo = gr.Interface(
	    fn=redact_pii,
	    inputs=gr.Textbox(
	        lines=5,
	        label="Input Text",
	        placeholder="Paste text containing sensitive data (names, emails, routing numbers) here...",
	    ),
	    outputs=gr.Textbox(
	        lines=5,
	        label="Redacted Output",
	    ),
	    title="🛡️ Secure PII Redaction Playground",
	    description=(
	        "Test our highly efficient (90MB) PII detection model that is capable of running locally on your device. "
	        "It quickly scrubs Personally Identifiable Information entirely on CPU, making it perfect "
	        "for sanitizing data before sending it to third-party cloud LLMs and other parties."
	    ),
	    # Footer text shown below the interface (typo "consultating" fixed).
	    article=(
	        "📧 Please reach out if you have a question or feedback. "
	        "We also do custom projects, consulting, freelance and collaboration: "
	        "[thieves@negativestarinnovators.com](mailto:thieves@negativestarinnovators.com)"
	    ),
	    # Each inner list is one example row; there is a single input component,
	    # so each row holds exactly one string.
	    examples=[
	        ["John Doe's routing number is 123456789 and his email is john.doe@email.com."],
	        ["Please update the shipping address for Jane Smith to 123 Secure Lane. Her phone number is 555-0198."],
	        ["The patient, Michael Johnson, was born on 10/12/1985. His SSN is 000-11-2222."],
	    ],
	    # Turns off the "Flag" button since we don't need to collect user data.
	    # NOTE(review): `flagging_mode` is the newer spelling of `allow_flagging`;
	    # confirm the Gradio version pinned for this Space accepts it.
	    flagging_mode="never",
	)

	# 4. Launch the app
	if __name__ == "__main__":
	    # Start the Gradio server only when run as a script, not on import.
	    demo.launch()