"""Streamlit app for Presidio.""" import logging import os import traceback import dotenv import pandas as pd import streamlit as st import streamlit.components.v1 as components from annotated_text import annotated_text from streamlit_tags import st_tags # from openai_fake_data_generator import OpenAIParams from presidio_helpers import ( get_supported_entities, analyze, anonymize, annotate, # create_fake_data, analyzer_engine, ) st.set_page_config( page_title="Presidio demo", layout="wide", initial_sidebar_state="expanded", # menu_items={ # "About": "https://microsoft.github.io/presidio/", # }, ) dotenv.load_dotenv() logger = logging.getLogger("presidio-streamlit") allow_other_models = os.getenv("ALLOW_OTHER_MODELS", False) # Sidebar st.sidebar.header( """ Personal Info Anonymization """ ) # set aliae logo st.sidebar.image('logo.png', use_column_width=True) model_help_text = """ Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers. Presidio supports multiple NER packages off-the-shelf, such as spaCy, Huggingface, Stanza and Flair, as well as service such as Azure Text Analytics PII. """ st_ta_key = st_ta_endpoint = "" model_list = [ "spaCy/en_core_web_lg", "spaCy/fr_core_news_lg", ] # "flair/ner-english-large", # # "HuggingFace/StanfordAIMI/stanford-deidentifier-base", # "Azure Text Analytics PII", # "Other", # if not allow_other_models: # model_list.pop() # Select model lang = st.sidebar.selectbox( "Language", ['en','fr'], index=0, ) # Extract model package. # st_model_package = st_model.split("/")[0] # # Remove package prefix (if needed) # st_model = ( # st_model # if st_model_package not in ("spaCy", "HuggingFace") # else "/".join(st_model.split("/")[1:]) # ) st_model = 'en_core_web_lg' st_model_package = "spaCy" if lang =='en': st_model_package = "spaCy" st_model = 'en_core_web_lg' elif lang == 'fr' : st_model_package = "HuggingFace" st_model = 'fr_core_news_lg' # if st_model == "Other": # st_model_package = st.sidebar.selectbox( # "NER model OSS package", options=["spaCy", "Flair", "HuggingFace"] # ) # st_model = st.sidebar.text_input(f"NER model name", value="") # if st_model == "Azure Text Analytics PII": # st_ta_key = st.sidebar.text_input( # f"Text Analytics key", value=os.getenv("TA_KEY", ""), type="password" # ) # st_ta_endpoint = st.sidebar.text_input( # f"Text Analytics endpoint", # value=os.getenv("TA_ENDPOINT", default=""), # help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview", # noqa: E501 # ) # st.sidebar.warning("Note: Models might take some time to download. ") analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint) logger.debug(f"analyzer_params: {analyzer_params}") st_operator = st.sidebar.selectbox( "De-identification approach", ["redact", "replace", "highlight"], index=2, help=""" Select which manipulation to the text is requested after PII has been identified.\n - Redact: Completely remove the PII text\n - Replace: Replace the PII text with a constant, e.g. \n - Highlight: Shows the original text with PII highlighted in colors\n """, ) st_mask_char = "*" st_number_of_chars = 15 st_encrypt_key = "WmZq4t7w!z%C&F)J" open_ai_params = None logger.debug(f"st_operator: {st_operator}") # if st_operator == "mask": # st_number_of_chars = st.sidebar.number_input( # "number of chars", value=st_number_of_chars, min_value=0, max_value=100 # ) # st_mask_char = st.sidebar.text_input( # "Mask character", value=st_mask_char, max_chars=1 # ) # elif st_operator == "encrypt": # st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key) # elif st_operator == "synthesize": # if os.getenv("OPENAI_TYPE", default="openai") == "Azure": # openai_api_type = "azure" # st_openai_api_base = st.sidebar.text_input( # "Azure OpenAI base URL", # value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""), # ) # st_deployment_name = st.sidebar.text_input( # "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="") # ) # st_openai_version = st.sidebar.text_input( # "OpenAI version", # value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"), # ) # else: # st_openai_version = openai_api_type = st_openai_api_base = None # st_deployment_name = "" # st_openai_key = st.sidebar.text_input( # "OPENAI_KEY", # value=os.getenv("OPENAI_KEY", default=""), # help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.", # type="password", # ) # st_openai_model = st.sidebar.text_input( # "OpenAI model for text synthesis", # value=os.getenv("OPENAI_MODEL", default="text-davinci-003"), # help="See more here: https://platform.openai.com/docs/models/", # ) # # open_ai_params = OpenAIParams( # openai_key=st_openai_key, # model=st_openai_model, # api_base=st_openai_api_base, # deployment_name=st_deployment_name, # api_version=st_openai_version, # api_type=openai_api_type, # ) # st_threshold = st.sidebar.slider( # label="Acceptance threshold", # min_value=0.0, # max_value=1.0, # value=0.35, # help="Define the threshold for accepting a detection as PII. See more here: ", # ) st_threshold = 0.35 # # st_return_decision_process = st.sidebar.checkbox( # "Add analysis explanations to findings", # value=False, # help="Add the decision process to the output table. " # "More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/", # ) st_return_decision_process = False # # Allow and deny lists # st_deny_allow_expander = st.sidebar.expander( # "Allowlists and denylists", # expanded=False, # ) # # with st_deny_allow_expander: # st_allow_list = st_tags( # label="Add words to the allowlist", text="Enter word and press enter." # ) # st.caption( # "Allowlists contain words that are not considered PII, but are detected as such." # ) # # st_deny_list = st_tags( # label="Add words to the denylist", text="Enter word and press enter." # ) # st.caption( # "Denylists contain words that are considered PII, but are not detected as such." # ) st_allow_list = [] st_deny_list = [] # Main panel with st.expander("About Microsoft Presidio", expanded=False): st.info( """Presidio is an open source customizable framework for PII detection and de-identification.""" ) analyzer_load_state = st.info("Starting Presidio analyzer...") analyzer_load_state.empty() # Read default text with open("en_demo_text.txt") as f: en_demo_text = f.readlines() with open("fr_demo_text.txt") as f: fr_demo_text = f.readlines() if lang == 'en': demo_text = en_demo_text elif lang == 'fr': demo_text = fr_demo_text # Create two columns for before and after col1, col2 = st.columns(2) # Before: col1.subheader("Input") st_text = col1.text_area( label="Enter text", value="".join(demo_text), height=400, key="text_input" ) try: # Choose entities st_entities_expander = st.sidebar.expander("Choose entities to look for") st_entities = st_entities_expander.multiselect( label="Which entities to look for?", options=get_supported_entities(*analyzer_params), default=list(get_supported_entities(*analyzer_params)), help="Limit the list of PII entities detected. " "This list is dynamic and based on the NER model and registered recognizers. " "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/", ) # Before analyzer_load_state = st.info("Starting Presidio analyzer...") analyzer = analyzer_engine(*analyzer_params) analyzer_load_state.empty() st_analyze_results = analyze( *analyzer_params, text=st_text, entities=st_entities, language=lang, score_threshold=st_threshold, return_decision_process=st_return_decision_process, allow_list=st_allow_list, deny_list=st_deny_list, ) # After if st_operator not in ("highlight", "synthesize"): with col2: st.subheader(f"Output") st_anonymize_results = anonymize( text=st_text, operator=st_operator, mask_char=st_mask_char, number_of_chars=st_number_of_chars, encrypt_key=st_encrypt_key, analyze_results=st_analyze_results, ) st.text_area( label="De-identified", value=st_anonymize_results.text, height=400 ) # elif st_operator == "synthesize": # with col2: # st.subheader(f"OpenAI Generated output") # fake_data = create_fake_data( # st_text, # st_analyze_results, # open_ai_params, # ) # st.text_area(label="Synthetic data", value=fake_data, height=400) else: st.subheader("Highlighted") annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results) # annotated_tokens annotated_text(*annotated_tokens) # table result st.subheader( "Findings" if not st_return_decision_process else "Findings with decision factors" ) if st_analyze_results: df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results]) df["text"] = [st_text[res.start : res.end] for res in st_analyze_results] df_subset = df[["entity_type", "text", "start", "end", "score"]].rename( { "entity_type": "Entity type", "text": "Text", "start": "Start", "end": "End", "score": "Confidence", }, axis=1, ) df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results] if st_return_decision_process: analysis_explanation_df = pd.DataFrame.from_records( [r.analysis_explanation.to_dict() for r in st_analyze_results] ) df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1) st.dataframe(df_subset.reset_index(drop=True), use_container_width=True) else: st.text("No findings") except Exception as e: print(e) traceback.print_exc() st.error(e) components.html( """ """ )