Spaces:

langdonholmes
/

piilo

Sleeping

App Files Files Community

piilo / app.py

langdonholmes

add logo to sidebar

e7e2ae9 over 1 year ago

raw

history blame

No virus

5 kB


	'''Streamlit app for Student Name Detection models.'''
	import json
	import os
	import warnings
	from json import JSONEncoder

	import pandas as pd
	import streamlit as st
	from annotated_text import annotated_text

	from piilo.engines.analyzer import CustomAnalyzer
	from piilo.engines.anonymizer import SurrogateAnonymizer

	# Set at import time, ahead of the first analyzer_engine() call further down.
	# Presumably read by the Hugging Face tokenizers library to turn off its own
	# parallelism -- TODO confirm against that library's documentation.
	os.environ['TOKENIZERS_PARALLELISM'] = 'false'
	# Install a blanket 'ignore' filter: every Python warning is suppressed
	# process-wide from here on.
	warnings.filterwarnings('ignore')

	# Helper methods
	@st.cache(allow_output_mutation=True)
	def analyzer_engine():
	    '''Build the CustomAnalyzer used for detection.

	    The function is wrapped in st.cache, so repeated calls reuse the cached
	    analyzer instead of constructing a new one.
	    '''
	    student_name_model = {
	        'lang_code': 'en',
	        'model_name': 'en_student_name_detector',
	    }
	    nlp_settings = {
	        'nlp_engine_name': 'spacy',
	        'models': [student_name_model],
	    }
	    return CustomAnalyzer(configuration=nlp_settings)

	@st.cache(allow_output_mutation=True)
	def anonymizer_engine():
	    '''Return a SurrogateAnonymizer; st.cache keeps the instance between calls.'''
	    surrogate_anonymizer = SurrogateAnonymizer()
	    return surrogate_anonymizer

	def annotate(text, st_analyze_results, st_entities):
	    '''Split text into plain and labelled pieces for annotated_text.

	    Args:
	        text: The string that was analyzed.
	        st_analyze_results: Iterable of results exposing start, end and
	            entity_type attributes; it may be unsorted.
	        st_entities: Not used here; kept so existing callers keep working.

	    Returns:
	        A list that alternates plain strings with (entity text, entity type)
	        tuples. For spans that lie inside text, joining the string parts in
	        order reproduces text exactly. With no results the list is [text],
	        so the caller still has the unannotated text to display.
	    '''
	    # sort by start index so the text can be walked left to right
	    results = sorted(st_analyze_results, key=lambda res: res.start)
	    if not results:
	        return [text]

	    tokens = []
	    cursor = 0  # index of the first character not yet emitted
	    for res in results:
	        # Clip spans that begin inside an earlier entity so the overlapping
	        # characters are not emitted twice.
	        start = max(res.start, cursor)
	        if start >= res.end:
	            # Entirely covered by a previous entity (or empty): nothing new.
	            continue
	        # plain text between the previous entity and this one
	        tokens.append(text[cursor:start])
	        # entity text together with its type
	        tokens.append((text[start:res.end], res.entity_type))
	        cursor = res.end

	    # whatever follows the last entity
	    tokens.append(text[cursor:])
	    return tokens


	st.set_page_config(layout='wide', page_title='Student Name Detector (English)')

	# Sidebar: logo, short description, then the detection settings.
	sidebar = st.sidebar
	sidebar.image('logo.png')

	sidebar.markdown(
	    '''Detect and anonymize PII in text using an [NLP model](https://huggingface.co/langdonholmes/en_student_name_detector) [trained](https://github.com/aialoe/deidentification-pipeline) on student-generated text collected from a massive online open-enrollment course.
	'''
	)

	# The entity picker offers every entity the analyzer reports as supported
	# and starts with all of them selected.
	sidebar_engine = analyzer_engine()
	st_entities = sidebar.multiselect(
	    label='Which entities to look for?',
	    options=sidebar_engine.get_supported_entities(),
	    default=list(sidebar_engine.get_supported_entities()),
	)

	st_threshold = sidebar.slider(
	    label='Acceptance threshold',
	    min_value=0.0,
	    max_value=1.0,
	    value=0.35,
	)

	st_return_decision_process = sidebar.checkbox(
	    label='Add analysis explanations in json',
	)

	sidebar.info(
	    'This is part of a project to develop new anonymization systems that are appropriate for student-generated text.'
	)

	# Main panel
	# Show a temporary notice while the analyzer is obtained, then clear it.
	loading_notice = st.info(
	    'Starting Presidio analyzer and loading Longformer-based model...'
	)
	engine = analyzer_engine()
	loading_notice.empty()

	# Sample essay pre-filled in the input box.
	default_text = (
	    'Learning Reflection\n\n'
	    'Written by John Williams and Samantha Morales\n\n'
	    'In this course I learned many things. '
	    'As Liedtke (2004) said, "Students grow when they learn" (Erickson et al. 1998).\n\n'
	    'By John H. Williams -- (714) 328-9989 -- johnwilliams@yahoo.com'
	)
	st_text = st.text_area(label='Type in some text', value=default_text, height=200)

	button = st.button('Detect PII')

	# Seed the flag the first time this session runs the script.
	if 'first_load' not in st.session_state:
	    st.session_state['first_load'] = True

	# Results: highlighted entities first, then the anonymized text.
	st.subheader('Analyzed')
	with st.spinner('Analyzing...'):
	    # NOTE(review): in the code visible here 'first_load' is only ever
	    # assigned True, so this condition appears to hold on every run
	    # regardless of the button -- confirm that is intended.
	    if button or st.session_state.first_load:
	        analysis_options = {
	            'text': st_text,
	            'entities': st_entities,
	            'language': 'en',
	            'score_threshold': st_threshold,
	            'return_decision_process': st_return_decision_process,
	        }
	        st_analyze_results = analyzer_engine().analyze(**analysis_options)
	        annotated_tokens = annotate(st_text, st_analyze_results, st_entities)
	        annotated_text(*annotated_tokens)

	# vertical space
	st.text('')

	st.subheader('Anonymized')
	with st.spinner('Anonymizing...'):
	    if button or st.session_state.first_load:
	        st_anonymize_results = anonymizer_engine().anonymize(
	            st_text, st_analyze_results
	        )
	        # Bare expression kept on purpose; presumably rendered through
	        # Streamlit's "magic" display of standalone values -- TODO confirm.
	        st_anonymize_results

	# Findings table: one row per detected entity.
	st.subheader('Detailed Findings')
	if not st_analyze_results:
	    st.text('No findings')
	else:
	    res_dicts = []
	    for finding in st_analyze_results:
	        record = finding.to_dict()
	        # Written into the dict returned by to_dict() itself (not a copy),
	        # exactly as before, in case later code relies on that.
	        record['Value'] = st_text[record['start']:record['end']]
	        res_dicts.append(record)

	    column_titles = {
	        'entity_type': 'Entity type',
	        'start': 'Start',
	        'end': 'End',
	        'score': 'Confidence',
	    }
	    df = pd.DataFrame.from_records(res_dicts)
	    df = df[['entity_type', 'Value', 'score', 'start', 'end']]
	    df = df.rename(columns=column_titles)

	    st.dataframe(df, width=1000)

	# NOTE(review): this stores True, the same value the flag is seeded with,
	# so the flag never changes in the code visible here -- confirm intent.
	st.session_state['first_load'] = True

	# json result
	class ToDictListEncoder(JSONEncoder):
	    '''JSON encoder for objects that expose a to_dict() method.'''

	    def default(self, o):
	        '''Return a JSON-serializable stand-in for o.

	        json calls this only for values it cannot serialize natively.

	        Args:
	            o: The object json could not encode on its own.

	        Returns:
	            An empty list when o is falsy, otherwise the result of
	            o.to_dict().

	        Raises:
	            TypeError: If o is truthy and has no callable to_dict; raised
	                by the base JSONEncoder so callers get the standard json
	                error rather than an AttributeError.
	        '''
	        if not o:
	            return []
	        to_dict = getattr(o, 'to_dict', None)
	        if callable(to_dict):
	            return to_dict()
	        # Unsupported type: defer to the base class, which raises TypeError.
	        return super().default(o)

	# Optional raw dump of the analyzer results, shown when the sidebar box is ticked.
	if st_return_decision_process:
	    findings_json = json.dumps(st_analyze_results, cls=ToDictListEncoder)
	    st.json(findings_json)