import spacy
import streamlit as st
import re
import logging
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, RecognizerResult, EntityRecognizer
from annotated_text import annotated_text
from flair_recognizer import FlairRecognizer
from detoxify import Detoxify

###############################
#### Render Streamlit page ####
###############################

st.title("Anonymise your text!")
st.markdown(
    "This mini-app anonymises text using Flair and Presidio. You can find the code in the Files and Versions tabs in the [HuggingFace page](https://huggingface.co/spaces/arogeriogel/anonymise_this)"
)

# Configure logger
logging.basicConfig(format="\n%(asctime)s\n%(message)s", level=logging.INFO, force=True)

##############################
###### Define functions ######
##############################

# @st.cache_resource(show_spinner="Fetching model from cache...")
@st.cache(allow_output_mutation=True)
def analyzer_engine():
    """Return AnalyzerEngine."""
    analyzer = AnalyzerEngine()
    flair_recognizer = FlairRecognizer()
    analyzer.registry.add_recognizer(flair_recognizer)
    return analyzer


def analyze(**kwargs):
    """Analyze input using Analyzer engine and input arguments (kwargs)."""
    if "entities" not in kwargs or "All" in kwargs["entities"]:
        kwargs["entities"] = None
    results = analyzer_engine().analyze(**kwargs)
    st.session_state.analyze_results = results


def annotate():
    """Build the token list consumed by annotated_text: plain strings
    interleaved with (entity_text, entity_type) tuples."""
    text = st.session_state.text
    analyze_results = st.session_state.analyze_results
    tokens = []
    starts = []
    # sort by start index
    results = sorted(analyze_results, key=lambda x: x.start)
    for i, res in enumerate(results):
        # if we already have an entity for this token, don't add another
        if res.start not in starts:
            if i == 0:
                tokens.append(text[:res.start])
            # append entity text and entity type
            tokens.append((text[res.start:res.end], res.entity_type))
            # if another entity is coming (i.e. we're not at the last result), add text up to the next entity
            if i != len(results) - 1:
                tokens.append(text[res.end:results[i + 1].start])
            # if no more entities are coming, add all remaining text
            else:
                tokens.append(text[res.end:])
            # remember this start index so we don't repeat results per token
            starts.append(res.start)
    return tokens


def get_supported_entities():
    """Return supported entities from the Analyzer Engine."""
    return analyzer_engine().get_supported_entities()


def analyze_text():
    if not st.session_state.text:
        st.session_state.text_error = "Please enter your text"
        return

    # Screen the input with Detoxify before analysing it for PII
    toxicity_results = Detoxify('original').predict(st.session_state.text)
    is_toxic = False
    for k in toxicity_results.keys():
        if k != 'toxicity':
            if toxicity_results[k] > 0.5:
                is_toxic = True
        else:
            if toxicity_results[k] > 0.65:
                is_toxic = True

    if is_toxic:
        st.session_state.text_error = "Your text entry was detected as toxic, please re-write it."
        return
    else:
        with text_spinner_placeholder:
            with st.spinner("Please wait while your text is being analysed..."):
                logging.info(f"This is the text being analysed: {st.session_state.text}")
                st.session_state.text_error = ""
                st.session_state.n_requests += 1
                analyze(
                    text=st.session_state.text,
                    entities=st_entities,
                    language="en",
                    return_decision_process=False,
                )
                if st.session_state.excluded_words:
                    exclude_manual_input()
                if st.session_state.allowed_words:
                    allow_manual_input()
                logging.info(
                    f"analyse results: {st.session_state.analyze_results}\n"
                )
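
# Illustrative note (not exhaustive): Detoxify('original').predict(text) returns a
# dict of scores in the 0-1 range keyed by category name. Only the 'toxicity' key is
# relied on by name in analyze_text() above; the remaining key names depend on the
# Detoxify release, so the shape below is an invented example, not captured output:
#
#   {'toxicity': 0.02, 'severe_toxicity': 0.001, 'insult': 0.01, ...}
#
# analyze_text() flags the input when 'toxicity' exceeds 0.65 or any other
# category exceeds 0.5.
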
def exclude_manual_input():
    deny_list = [i.strip() for i in st.session_state.excluded_words.split(',')]

    def _deny_list_to_regex(deny_list):
        """
        Convert a list of words to a matching regex.

        To be analyzed by the analyze method as any other regex patterns.

        :param deny_list: the list of words to detect
        :return: the regex of the words for detection
        """
        # Escape deny list elements as preparation for regex
        escaped_deny_list = [re.escape(element) for element in deny_list]
        # e.g. ["John", "Mary"] -> r"(?:^|(?<=\W))(John|Mary)(?:(?=\W)|$)"
        regex = r"(?:^|(?<=\W))(" + "|".join(escaped_deny_list) + r")(?:(?=\W)|$)"
        return regex

    deny_list_pattern = _deny_list_to_regex(deny_list)
    matches = re.finditer(deny_list_pattern, st.session_state.text)
    results = []
    for match in matches:
        start, end = match.span()
        current_match = st.session_state.text[start:end]
        # Skip empty results
        if current_match == "":
            continue
        pattern_result = RecognizerResult(
            entity_type='MANUALLY ADDED',
            start=start,
            end=end,
            score=1.0,
        )
        # check if this span is already in the detected strings
        found = False
        for token in st.session_state.analyze_results:
            if token.start == start and token.end == end:
                found = True
        if not found:
            results.append(pattern_result)
    results = EntityRecognizer.remove_duplicates(results)
    st.session_state.analyze_results.extend(results)
    logging.info(
        f"analyse results after adding excluded words: {st.session_state.analyze_results}\n"
    )


def allow_manual_input():
    # Treat the allowed-words box as a comma-separated list, mirroring exclude_manual_input()
    allowed_list = [i.strip() for i in st.session_state.allowed_words.split(',')]
    analyze_results_filtered = []
    for token in st.session_state.analyze_results:
        if st.session_state.text[token.start:token.end] not in allowed_list:
            analyze_results_filtered.append(token)
    logging.info(
        f"analyse results after removing allowed words: {analyze_results_filtered}\n"
    )
    st.session_state.analyze_results = analyze_results_filtered


# @st.cache_resource(show_spinner="Fetching model from cache...")
@st.cache(allow_output_mutation=True)
def anonymizer_engine():
    """Return AnonymizerEngine."""
    return AnonymizerEngine()


def anonymise_text():
    if st.session_state.n_requests >= 50:
        st.session_state.text_error = "Too many requests. Please wait a few seconds before anonymising more text."
        logging.info(f"Session request limit reached: {st.session_state.n_requests}")
        st.session_state.n_requests = 1
        return

    st.session_state.text_error = ""
    if not st.session_state.text:
        st.session_state.text_error = "Please enter your text"
        return
    if not st.session_state.analyze_results:
        analyze_text()
    with text_spinner_placeholder:
        with st.spinner("Please wait while your text is being anonymised..."):
            anon_results = anonymizer_engine().anonymize(st.session_state.text, st.session_state.analyze_results)
            st.session_state.text_error = ""
            st.session_state.n_requests += 1
            st.session_state.anon_results = anon_results
            logging.info(
                f"text anonymised: {st.session_state.anon_results}"
            )
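
# Illustrative note: the object returned by AnonymizerEngine.anonymize() exposes the
# redacted string on its .text attribute, which is what the page renders below. With
# Presidio's default operator each detected span is replaced by its entity label,
# e.g. "My name is John" -> "My name is <PERSON>" (invented sample, not app output).
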
logging.info(f"Session request limit reached: {st.session_state.n_requests}") st.session_state.n_requests = 1 st.session_state.text_error = "" if not st.session_state.text: st.session_state.text_error = "Please enter your text" return if not st.session_state.analyze_results: analyze_text() with text_spinner_placeholder: with st.spinner("Please wait while your text is being anonymised..."): anon_results = anonymizer_engine().anonymize(st.session_state.text, st.session_state.analyze_results) st.session_state.text_error = "" st.session_state.n_requests += 1 st.session_state.anon_results = anon_results logging.info( f"text anonymised: {st.session_state.anon_results}" ) def clear_results(): st.session_state.anon_results="" st.session_state.analyze_results="" ####################################### #### Initialize "global" variables #### ####################################### if "text_error" not in st.session_state: st.session_state.text_error = "" if "analyze_results" not in st.session_state: st.session_state.analyze_results = "" if "anon_results" not in st.session_state: st.session_state.anon_results = "" if "n_requests" not in st.session_state: st.session_state.n_requests = 0 ############################## ####### Page arguments ####### ############################## # Every widget with a key is automatically added to Session State as a global variable. # In Streamlit, interacting with a widget triggers a rerun and variables defined # in the code get reinitialized after each rerun. # If a callback function is associated with a widget then a change in the widget # triggers the following sequence: First the callback function is executed and then # the app executes from top to bottom. st.text_input( label="Text", placeholder="Write your text here", key='text', on_change=clear_results ) st.text_input( label="Data to be redacted (optional)", placeholder="John, Mary, London", key='excluded_words', on_change=clear_results ) st.text_input( label="Data to be ignored (optional)", placeholder="NHS, GEL, Lab", key='allowed_words', on_change=clear_results ) st_entities = st.sidebar.multiselect( label="Which entities to look for?", options=get_supported_entities(), default=list(get_supported_entities()), ) ############################## ######## Page buttons ######## ############################## # button return true when clicked col1, col2 = st.columns(2) analyze_now=False with col1: analyze_now = st.button( label="Analyse text", type="primary", on_click=analyze_text, ) anonymise_now=False with col2: anonymise_now = st.button( label="Anonymise text", type="primary", on_click=anonymise_text, ) ############################## ######## Page actions ######## ############################## text_spinner_placeholder = st.empty() if st.session_state.text_error: st.error(st.session_state.text_error) with col1: if st.session_state.analyze_results: annotated_tokens=annotate() annotated_text(*annotated_tokens) st.write(st.session_state.analyze_results) if not st.session_state.analyze_results and analyze_now and not st.session_state.text_error: st.write("### No PII was found. ###") with col2: if st.session_state.anon_results: st.write(st.session_state.anon_results.text) if not st.session_state.analyze_results and anonymise_now and not st.session_state.text_error: st.write("### No PII was found. ###")