piilo / app.py
langdonholmes
add logo to sidebar
e7e2ae9
raw
history blame
No virus
5 kB
'''Streamlit app for Student Name Detection models.'''
import json
import os
import warnings
from json import JSONEncoder
import pandas as pd
import streamlit as st
from annotated_text import annotated_text
from piilo.engines.analyzer import CustomAnalyzer
from piilo.engines.anonymizer import SurrogateAnonymizer
# NOTE(review): presumably turns off Hugging Face tokenizers parallelism to
# avoid fork-related warnings when the model is loaded — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
# Install a global filter that ignores every warning category.
warnings.filterwarnings('ignore')
# Helper methods
@st.cache(allow_output_mutation=True)
def analyzer_engine():
    '''Build the CustomAnalyzer and let Streamlit cache the instance.

    The analyzer is configured with the spaCy NLP engine and a single
    English model, ``en_student_name_detector``.
    '''
    english_model = {
        'lang_code': 'en',
        'model_name': 'en_student_name_detector',
    }
    return CustomAnalyzer(
        configuration={
            'nlp_engine_name': 'spacy',
            'models': [english_model],
        }
    )
@st.cache(allow_output_mutation=True)
def anonymizer_engine():
    '''Create the SurrogateAnonymizer and let Streamlit cache the instance.'''
    surrogate_anonymizer = SurrogateAnonymizer()
    return surrogate_anonymizer
def annotate(text, st_analyze_results, st_entities):
    '''Split ``text`` into plain and labelled tokens for ``annotated_text``.

    Args:
        text: The string that was analyzed.
        st_analyze_results: Iterable of results exposing ``start``, ``end``
            and ``entity_type`` attributes.
        st_entities: Unused; retained so existing callers keep working.

    Returns:
        A list that alternates plain ``str`` segments with
        ``(entity_text, entity_type)`` tuples, ordered by entity start
        offset. When there are no results the list is ``[text]`` so the
        unannotated text is still displayed instead of vanishing.
    '''
    tokens = []
    cursor = 0  # end offset of the most recently emitted entity
    for res in sorted(st_analyze_results, key=lambda r: r.start):
        # plain text between the previous entity (or the start) and this one
        tokens.append(text[cursor:res.start])
        # entity text paired with its entity type
        tokens.append((text[res.start:res.end], res.entity_type))
        cursor = res.end
    # whatever follows the last entity (the whole text if none were found)
    tokens.append(text[cursor:])
    return tokens
# Set the page title and use the wide layout.
st.set_page_config(page_title='Student Name Detector (English)', layout='wide')
# Side bar
# Logo, followed by a short description that links to the model and to the
# training pipeline repository.
st.sidebar.image('logo.png')
st.sidebar.markdown(
'''Detect and anonymize PII in text using an [NLP model](https://huggingface.co/langdonholmes/en_student_name_detector) [trained](https://github.com/aialoe/deidentification-pipeline) on student-generated text collected from a massive online open-enrollment course.
'''
)
# Entity picker: options and default both come from the analyzer's supported
# entities, so every supported entity starts out selected.
st_entities = st.sidebar.multiselect(
    label='Which entities to look for?',
    options=analyzer_engine().get_supported_entities(),
    default=list(analyzer_engine().get_supported_entities()),
)
# Threshold slider (0.0 to 1.0, initial value 0.35); the chosen value is
# passed to analyze() as score_threshold further down.
st_threshold = st.sidebar.slider(
    label='Acceptance threshold', min_value=0.0, max_value=1.0, value=0.35
)
# When ticked, analyze() is asked for its decision process and the findings
# are also rendered as JSON at the bottom of the page.
st_return_decision_process = st.sidebar.checkbox(
    'Add analysis explanations in json')
st.sidebar.info(
    'This is part of a project to develop new anonymization systems that are appropriate for student-generated text.'
)
# Main panel
# Show a temporary notice while the cached analyzer is obtained, then clear
# the notice once analyzer_engine() has returned.
analyzer_load_state = st.info(
    'Starting Presidio analyzer and loading Longformer-based model...')
# NOTE(review): `engine` is not referenced again in this file; later code
# calls analyzer_engine() directly.
engine = analyzer_engine()
analyzer_load_state.empty()
# Input area, pre-filled with a sample reflection that contains personal
# names, citations, a phone number and an email address.
st_text = st.text_area(
    label='Type in some text',
    value='Learning Reflection\n\nWritten by John Williams and Samantha Morales\n\nIn this course I learned many things. As Liedtke (2004) said, \"Students grow when they learn\" (Erickson et al. 1998).\n\nBy John H. Williams -- (714) 328-9989 -- johnwilliams@yahoo.com',
    height=200,
)
button = st.button('Detect PII')
# Seed the session flag if this session does not have it yet.
if 'first_load' not in st.session_state:
    st.session_state['first_load'] = True
# After
st.subheader('Analyzed')
with st.spinner('Analyzing...'):
    # NOTE(review): 'first_load' is only ever assigned True in this file, so
    # this condition appears to hold on every run, not only after a click.
    if button or st.session_state.first_load:
        # Run detection with the sidebar settings (entities, threshold,
        # decision-process flag) on the text area contents.
        st_analyze_results = analyzer_engine().analyze(
            text=st_text,
            entities=st_entities,
            language='en',
            score_threshold=st_threshold,
            return_decision_process=st_return_decision_process,
        )
        # Convert the findings into tokens and render them inline.
        annotated_tokens = annotate(st_text, st_analyze_results, st_entities)
        # annotated_tokens
        annotated_text(*annotated_tokens)
# vertical space
st.text('')
st.subheader('Anonymized')
with st.spinner('Anonymizing...'):
    if button or st.session_state.first_load:
        # Anonymize the same text using the findings from the analysis step.
        st_anonymize_results = anonymizer_engine().anonymize(
            st_text,
            st_analyze_results)
        # Bare expression: presumably displayed through Streamlit's "magic"
        # rendering of standalone values — confirm.
        st_anonymize_results
# table result
st.subheader('Detailed Findings')
if not st_analyze_results:
    st.text('No findings')
else:
    # One record per finding, extended with the substring it matched.
    res_dicts = [
        {**d, 'Value': st_text[d['start']:d['end']]}
        for d in (r.to_dict() for r in st_analyze_results)
    ]
    # Display names for the columns shown in the table.
    column_labels = {
        'entity_type': 'Entity type',
        'start': 'Start',
        'end': 'End',
        'score': 'Confidence',
    }
    df = pd.DataFrame.from_records(res_dicts)
    df = df[['entity_type', 'Value', 'score', 'start', 'end']]
    df = df.rename(column_labels, axis=1)
    st.dataframe(df, width=1000)
# Re-assert the session flag at the end of the run.
st.session_state['first_load'] = True
# json result
class ToDictListEncoder(JSONEncoder):
    '''JSON encoder that serializes objects through their ``to_dict`` method.'''

    def default(self, o):
        '''Return ``o.to_dict()`` for a truthy ``o``, otherwise an empty list.'''
        return o.to_dict() if o else []
# Render the findings as JSON only when the sidebar checkbox is ticked.
if st_return_decision_process:
    findings_json = json.dumps(st_analyze_results, cls=ToDictListEncoder)
    st.json(findings_json)