"""Streamlit app that analyses and anonymises text using Flair NER and Presidio."""

import logging
import re

import streamlit as st
from annotated_text import annotated_text
from flair.data import Sentence
from flair.models import SequenceTagger
from presidio_analyzer import AnalyzerEngine

# Custom Presidio recognizer wrapping the Flair tagger (local module)
from flair_recognizer import FlairRecognizer

# Render Streamlit page
st.title("Anonymise your text!")
st.markdown(
    "This mini-app anonymises text using Flair. You can find the code on [GitHub (WIP)](#)"
)

# Configure logger
logging.basicConfig(format="\n%(asctime)s\n%(message)s", level=logging.INFO, force=True)


@st.cache(suppress_st_warning=True, allow_output_mutation=True, show_spinner=False)
def load_tagger():
    """Load and cache the Flair NER tagger."""
    return SequenceTagger.load("flair/ner-english-large")


@st.cache(allow_output_mutation=True, show_spinner=False)
def analyzer_engine():
    """Return a cached AnalyzerEngine with the Flair recognizer registered."""
    analyzer = AnalyzerEngine()
    flair_recognizer = FlairRecognizer()
    analyzer.registry.add_recognizer(flair_recognizer)
    return analyzer


def analyze(**kwargs):
    """Analyze input using the Analyzer engine and input arguments (kwargs)."""
    if "entities" not in kwargs or "All" in kwargs["entities"]:
        kwargs["entities"] = None
    return analyzer_engine().analyze(**kwargs)


def annotate(text, analyze_results):
    """Convert analyzer results into tokens for annotated_text()."""
    tokens = []
    # Sort results by start index
    results = sorted(analyze_results, key=lambda x: x.start)
    for i, res in enumerate(results):
        # Add any text before the first entity
        if i == 0:
            tokens.append(text[:res.start])
        # Append entity text and entity type
        tokens.append((text[res.start:res.end], res.entity_type))
        if i != len(results) - 1:
            # Another entity follows, i.e. we're not at the last results
            # element: add the text up to the next entity
            tokens.append(text[res.end:results[i + 1].start])
        else:
            # No more entities coming: add all remaining text
            tokens.append(text[res.end:])
    return tokens


def get_supported_entities():
    """Return supported entities from the Analyzer Engine."""
    return analyzer_engine().get_supported_entities()


st_entities = st.sidebar.multiselect(
    label="Which entities to look for?",
    options=get_supported_entities(),
    default=list(get_supported_entities()),
)


def analyze_text(text: str, st_entities: list):
    """Analyse text and store the annotated tokens in the session state."""
    if not text:
        st.session_state.text_error = "Please enter your text"
        return
    with text_spinner_placeholder:
        with st.spinner("Please wait while your text is being analysed..."):
            logging.info(f"This is the text being analysed: {text}")
            analyze_results = analyze(
                text=text,
                entities=st_entities,
                language="en",
                return_decision_process=False,
            )
            st.session_state.annotated_tokens = annotate(text, analyze_results)
            logging.info(
                f"text: {text}\n"
                f"tokens: {st.session_state.annotated_tokens}\n"
            )


def anonymise_text(text: str, metadata: str = "", white_listed_words: str = ""):
    """Anonymise text by removing the entities detected by the Flair tagger."""
    if st.session_state.n_requests >= 50:
        st.session_state.text_error = (
            "Too many requests. Please wait a few seconds before anonymising more text."
        )
        logging.info(f"Session request limit reached: {st.session_state.n_requests}")
        st.session_state.n_requests = 1
        return
    st.session_state.text = ""
    st.session_state.text_error = ""
    if not text:
        st.session_state.text_error = "Please enter your text"
        return
    with text_spinner_placeholder:
        with st.spinner("Please wait while your text is being anonymised..."):
            # Load the tagger and predict NER tags
            tagger = load_tagger()
            sentence = Sentence(text)
            tagger.predict(sentence)
            # Iterate over detected entities and redact them from the text
            entities = [e.text for e in sentence.get_spans("ner")]
            if entities:
                regex = re.compile("|".join(map(re.escape, entities)))
                text_anon = regex.sub("", text)
            else:
                text_anon = text
            st.session_state.text_error = ""
            st.session_state.n_requests += 1
            st.session_state.text_anon = text_anon
            # Note: metadata and white_listed_words are currently only logged,
            # not yet used to adjust the redaction.
            logging.info(
                f"text: {text}{metadata}{white_listed_words}\n"
                f"entities: {sentence.get_spans('ner')}\n"
                f"text anonymised: {st.session_state.text_anon}"
            )


# Initialise session state
if "text" not in st.session_state:
    st.session_state.text = ""
if "text_error" not in st.session_state:
    st.session_state.text_error = ""
if "annotated_tokens" not in st.session_state:
    st.session_state.annotated_tokens = ""
if "text_anon" not in st.session_state:
    st.session_state.text_anon = ""
if "n_requests" not in st.session_state:
    st.session_state.n_requests = 0

text = st.text_input(label="Text to be anonymised", placeholder="Write your text here")
metadata = st.text_input(
    label="Data to be redacted (optional)",
    placeholder="e.g. names or dates",
)
white_listed_words = st.text_input(
    label="Data to be ignored (optional)",
    placeholder="e.g. words to keep",
)

# The button returns True when clicked
analyze_now = st.button(
    label="Analyse text",
    type="primary",
    on_click=analyze_text,
    args=(text, st_entities),
)

# The button returns True when clicked
anonymise_now = st.button(
    label="Anonymise text",
    type="primary",
    on_click=anonymise_text,
    args=(text, metadata, white_listed_words),
)

text_spinner_placeholder = st.empty()
if st.session_state.text_error:
    st.error(st.session_state.text_error)

if analyze_now:
    annotated_text(*st.session_state.annotated_tokens)

if st.session_state.text_anon:
    st.markdown("---")
    st.text_area(label="Text anonymised", value=st.session_state.text_anon, height=100)