Spaces:

beki
/

pii-anonymizer

Running

App Files Files Community

pii-anonymizer / app.py

beki

Update app.py

6db31f1 almost 2 years ago

raw

history blame

No virus

5.25 kB

	"""Streamlit app for Presidio."""

	import json
	from json import JSONEncoder
	from annotated_text import annotated_text
	import pandas as pd
	import streamlit as st
	from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
	from presidio_anonymizer import AnonymizerEngine

	from flair_recognizer import FlairRecognizer

	import spacy
	# spacy.cli.download("en_core_web_lg")

	# Helper methods
	@st.cache(allow_output_mutation=True)
	def analyzer_engine():
	    """Build the Presidio ``AnalyzerEngine`` used by the app.

	    A fresh ``RecognizerRegistry`` receives a ``FlairRecognizer``, then the
	    predefined Presidio recognizers are loaded, and finally the recognizer
	    named "SpacyRecognizer" is removed. The function is wrapped in
	    ``st.cache(allow_output_mutation=True)``.

	    Returns:
	        An ``AnalyzerEngine`` constructed on top of that registry.
	    """
	    recognizers = RecognizerRegistry()
	    recognizers.add_recognizer(FlairRecognizer())
	    recognizers.load_predefined_recognizers()
	    # Dropped after the predefined set is loaded, as in the original order.
	    recognizers.remove_recognizer("SpacyRecognizer")
	    return AnalyzerEngine(registry=recognizers)


	@st.cache(allow_output_mutation=True)
	def anonymizer_engine():
	    """Create the Presidio ``AnonymizerEngine`` used by the app.

	    Returns:
	        A default-constructed ``AnonymizerEngine``.
	    """
	    engine = AnonymizerEngine()
	    return engine


	def get_supported_entities():
	    """List the entities that the analyzer engine reports as supported.

	    Returns:
	        The result of ``get_supported_entities()`` on ``analyzer_engine()``.
	    """
	    engine = analyzer_engine()
	    return engine.get_supported_entities()


	def analyze(**kwargs):
	    """Run the analyzer engine with the given keyword arguments.

	    The ``entities`` argument is normalised before the call: when it is
	    missing, ``None``, or contains the "All" marker, ``None`` is passed on
	    instead.

	    Args:
	        **kwargs: Forwarded to ``analyzer_engine().analyze`` (with
	            ``entities`` normalised as described above).

	    Returns:
	        Whatever ``analyzer_engine().analyze`` returns.
	    """
	    requested = kwargs.get("entities")
	    # Guard against an explicit entities=None: a bare `"All" in None`
	    # membership test would raise TypeError.
	    if requested is None or "All" in requested:
	        kwargs["entities"] = None
	    return analyzer_engine().analyze(**kwargs)


	def anonymize(text, analyze_results):
	    """Anonymize the analyzed findings in ``text`` with the anonymizer engine.

	    Args:
	        text: The original input text.
	        analyze_results: Findings passed, together with ``text``, to
	            ``anonymizer_engine().anonymize``.

	    Returns:
	        The ``text`` attribute of the anonymizer's result.
	    """
	    return anonymizer_engine().anonymize(text, analyze_results).text

	def annotate(text, st_analyze_results, st_entities):
	    """Split ``text`` into plain and labelled tokens for ``annotated_text``.

	    Findings are processed in order of their ``start`` offset. The output
	    alternates plain-text strings with ``(entity_text, entity_type)`` tuples
	    and always ends with the text that follows the last finding.

	    Args:
	        text: The analyzed input text.
	        st_analyze_results: Iterable of findings exposing ``start``, ``end``
	            and ``entity_type`` attributes.
	        st_entities: Unused; kept so existing callers keep working.

	    Returns:
	        A list of tokens. With no findings this is ``[text]`` so the input
	        is still displayed instead of vanishing.
	    """
	    tokens = []
	    cursor = 0  # index of the first character not yet emitted
	    for res in sorted(st_analyze_results, key=lambda r: r.start):
	        # A finding fully covered by earlier ones would repeat text; skip it.
	        if res.end <= cursor:
	            continue
	        # Clamp partially overlapping findings so no character is emitted twice.
	        begin = max(res.start, cursor)
	        # Plain text between the previous finding (or the start) and this one.
	        tokens.append(text[cursor:begin])
	        # Entity text paired with its entity type.
	        tokens.append((text[begin:res.end], res.entity_type))
	        cursor = res.end
	    # Whatever remains after the last finding (the whole text if there are none).
	    tokens.append(text[cursor:])
	    return tokens

	st.set_page_config(page_title="Presidio demo (English)", layout="wide")

	# Sidebar: intro text, entity picker, threshold slider and explanation toggle.
	sidebar = st.sidebar
	sidebar.markdown(
	    """
	Detect and anonymize PII entities in text with a [PII detection model](https://huggingface.co/beki/flair-pii-english) trained on protocol trace data generated by [privy](https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy) and [presidio](https://aka.ms/presidio).
	"""
	)

	# Every supported entity is offered as an option and selected by default.
	st_entities = sidebar.multiselect(
	    label="Which entities to look for?",
	    options=get_supported_entities(),
	    default=list(get_supported_entities()),
	)

	# Later forwarded to analyze() as score_threshold.
	st_threshold = sidebar.slider(
	    label="Acceptance threshold",
	    min_value=0.0,
	    max_value=1.0,
	    value=0.35,
	)

	# Later forwarded to analyze() as return_decision_process and used to
	# decide whether the JSON dump is shown.
	st_return_decision_process = sidebar.checkbox("Add analysis explanations in json")

	sidebar.info(
	    "Privy is an open source framework for synthetic data generation in protocol trace formats (json, sql, html etc). Presidio is an open source framework for PII detection and anonymization. "
	    "For more info visit [aka.ms/presidio](https://aka.ms/presidio) and [privy](https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy)"
	)


	# Main panel: show a notice while the analyzer is built, then clear it.
	analyzer_load_state = st.info("Starting Presidio analyzer and loading Privy-trained model...")
	engine = analyzer_engine()
	analyzer_load_state.empty()

	# Sample input pre-filled in the text area: one JSON-like line, one SQL line.
	default_text = (
	    "{first_name: Willie Porter, ip_address: 192.168.2.80, email: willie@gmail.com}"
	    "\n"
	    "SELECT address FROM users WHERE address = '47 W 13th St, New York, NY 10011'"
	)
	st_text = st.text_area(label="Type in some text", value=default_text, height=200)

	# Analysis output: run the analyzer on the input and render it with
	# the detected entities highlighted.
	st.subheader("Analyzed")
	with st.spinner("Analyzing..."):
	    analysis_args = {
	        "text": st_text,
	        "entities": st_entities,
	        "language": "en",
	        "score_threshold": st_threshold,
	        "return_decision_process": st_return_decision_process,
	    }
	    st_analyze_results = analyze(**analysis_args)
	    annotated_tokens = annotate(st_text, st_analyze_results, st_entities)
	    annotated_text(*annotated_tokens)
	# Empty text element used as vertical spacing before the next section.
	st.text("")

	# Anonymized output: replace the detected entities and show the result.
	st.subheader("Anonymized")

	with st.spinner("Anonymizing..."):
	    st_anonymize_results = anonymize(st_text, st_analyze_results)
	    # Write explicitly instead of relying on a bare expression: that form is
	    # only rendered when Streamlit's "magic" feature is enabled.
	    st.write(st_anonymize_results)


	# Findings table: one row per analyzer result, or a note when there are none.
	st.subheader("Detailed Findings")
	if not st_analyze_results:
	    st.text("No findings")
	else:
	    findings = []
	    for finding in st_analyze_results:
	        record = finding.to_dict()
	        # Add the matched substring alongside the offsets. The dict returned
	        # by to_dict() is updated in place, exactly as before.
	        record['Value'] = st_text[record['start']:record['end']]
	        findings.append(record)

	    # Display names for the columns shown to the user.
	    column_labels = {
	        "entity_type": "Entity type",
	        "start": "Start",
	        "end": "End",
	        "score": "Confidence",
	    }
	    df = pd.DataFrame.from_records(findings)
	    df = df[["entity_type", "Value", "score", "start", "end"]].rename(
	        column_labels,
	        axis=1,
	    )

	    st.dataframe(df, width=1000)


	# json result
	class ToDictListEncoder(JSONEncoder):
	    """JSON encoder for objects that expose a ``to_dict`` method."""

	    def default(self, o):
	        """Return ``o.to_dict()`` for a truthy ``o``, otherwise ``[]``.

	        ``json`` calls this hook for values it cannot serialize natively.
	        """
	        return o.to_dict() if o else []


	# Raw JSON dump of the findings, shown only when the sidebar checkbox
	# for analysis explanations is ticked.
	if st_return_decision_process:
	    findings_json = json.dumps(st_analyze_results, cls=ToDictListEncoder)
	    st.json(findings_json)