Spaces:

omri374
/

presidio

Running

App Files Files Community

omri374 committed on Dec 8, 2023

Commit

41e004f

•

1 Parent(s): b75d1f1

Upload 12 files

Browse files

Files changed (9) hide show

azure_ai_language_wrapper.py +126 -0
flair_recognizer.py +5 -5
flair_test.py +25 -0
index.md +15 -5
openai_fake_data_generator.py +12 -8
presidio_helpers.py +9 -6
presidio_nlp_engine_config.py +118 -40
presidio_streamlit.py +41 -17
test_streamlit.py +43 -0

azure_ai_language_wrapper.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import os
+from typing import List, Optional
+import logging
+import dotenv
+from azure.ai.textanalytics import TextAnalyticsClient
+from azure.core.credentials import AzureKeyCredential
+from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
+from presidio_analyzer.nlp_engine import NlpArtifacts
+logger = logging.getLogger("presidio-streamlit")
+class AzureAIServiceWrapper(EntityRecognizer):
+    from azure.ai.textanalytics._models import PiiEntityCategory
+    TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]
+    def __init__(
+        self,
+        supported_entities: Optional[List[str]] = None,
+        supported_language: str = "en",
+        ta_client: Optional[TextAnalyticsClient] = None,
+        ta_key: Optional[str] = None,
+        ta_endpoint: Optional[str] = None,
+    ):
+        """
+        Wrapper for the Azure Text Analytics client
+        :param ta_client: object of type TextAnalyticsClient
+        :param ta_key: Azure cognitive Services for Language key
+        :param ta_endpoint: Azure cognitive Services for Language endpoint
+        """
+        if not supported_entities:
+            supported_entities = self.TA_SUPPORTED_ENTITIES
+        super().__init__(
+            supported_entities=supported_entities,
+            supported_language=supported_language,
+            name="Azure AI Language PII",
+        )
+        self.ta_key = ta_key
+        self.ta_endpoint = ta_endpoint
+        if not ta_client:
+            ta_client = self.__authenticate_client(ta_key, ta_endpoint)
+        self.ta_client = ta_client
+    @staticmethod
+    def __authenticate_client(key: str, endpoint: str):
+        ta_credential = AzureKeyCredential(key)
+        text_analytics_client = TextAnalyticsClient(
+            endpoint=endpoint, credential=ta_credential
+        )
+        return text_analytics_client
+    def analyze(
+        self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None
+    ) -> List[RecognizerResult]:
+        if not entities:
+            entities = []
+        response = self.ta_client.recognize_pii_entities(
+            [text], language=self.supported_language
+        )
+        results = [doc for doc in response if not doc.is_error]
+        recognizer_results = []
+        for res in results:
+            for entity in res.entities:
+                if entity.category not in self.supported_entities:
+                    continue
+                analysis_explanation = AzureAIServiceWrapper._build_explanation(
+                    original_score=entity.confidence_score,
+                    entity_type=entity.category,
+                )
+                recognizer_results.append(
+                    RecognizerResult(
+                        entity_type=entity.category,
+                        start=entity.offset,
+                        end=entity.offset + len(entity.text),
+                        score=entity.confidence_score,
+                        analysis_explanation=analysis_explanation,
+                    )
+                )
+        return recognizer_results
+    @staticmethod
+    def _build_explanation(
+        original_score: float, entity_type: str
+    ) -> AnalysisExplanation:
+        explanation = AnalysisExplanation(
+            recognizer=AzureAIServiceWrapper.__class__.__name__,
+            original_score=original_score,
+            textual_explanation=f"Identified as {entity_type} by Text Analytics",
+        )
+        return explanation
+    def load(self) -> None:
+        pass
+if __name__ == "__main__":
+    import presidio_helpers
+    dotenv.load_dotenv()
+    text = """
+    Here are a few example sentences we currently support:
+    Hello, my name is David Johnson and I live in Maine.
+    My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
+    On September 18 I visited microsoft.com and sent an email to test@presidio.site,  from the IP 192.168.0.1.
+    My passport: 191280342 and my phone number: (212) 555-1234.
+    This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
+    Kate's social security number is 078-05-1126.  Her driver license? it is 1234567A.
+    """
+    analyzer = presidio_helpers.analyzer_engine(
+        model_path="Azure Text Analytics PII",
+        ta_key=os.environ["TA_KEY"],
+        ta_endpoint=os.environ["TA_ENDPOINT"],
+    )
+    analyzer.analyze(text=text, language="en")

flair_recognizer.py CHANGED Viewed

@@ -59,9 +59,7 @@ class FlairRecognizer(EntityRecognizer):
         # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
     ]
-    MODEL_LANGUAGES = {
-        "en": "flair/ner-english-large"
-    }
     PRESIDIO_EQUIVALENCES = {
         "PER": "PERSON",
@@ -76,7 +74,7 @@ class FlairRecognizer(EntityRecognizer):
         supported_entities: Optional[List[str]] = None,
         check_label_groups: Optional[Tuple[Set, Set]] = None,
         model: SequenceTagger = None,
-        model_path: Optional[str] = None
     ):
         self.check_label_groups = (
             check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
@@ -93,7 +91,9 @@ class FlairRecognizer(EntityRecognizer):
             self.model = SequenceTagger.load(model_path)
         else:
             print(f"Loading model for language {supported_language}")
-            self.model = SequenceTagger.load(self.MODEL_LANGUAGES.get(supported_language))
         super().__init__(
             supported_entities=supported_entities,

         # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
     ]
+    MODEL_LANGUAGES = {"en": "flair/ner-english-large"}
     PRESIDIO_EQUIVALENCES = {
         "PER": "PERSON",
         supported_entities: Optional[List[str]] = None,
         check_label_groups: Optional[Tuple[Set, Set]] = None,
         model: SequenceTagger = None,
+        model_path: Optional[str] = None,
     ):
         self.check_label_groups = (
             check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
             self.model = SequenceTagger.load(model_path)
         else:
             print(f"Loading model for language {supported_language}")
+            self.model = SequenceTagger.load(
+                self.MODEL_LANGUAGES.get(supported_language)
+            )
         super().__init__(
             supported_entities=supported_entities,

flair_test.py ADDED Viewed

	@@ -0,0 +1,25 @@

+# Import generic wrappers
+from transformers import AutoModel, AutoTokenizer
+if __name__ == "__main__":
+    from flair.data import Sentence
+    from flair.models import SequenceTagger
+    # load tagger
+    tagger = SequenceTagger.load("flair/ner-english-large")
+    # make example sentence
+    sentence = Sentence("George Washington went to Washington")
+    # predict NER tags
+    tagger.predict(sentence)
+    # print sentence
+    print(sentence)
+    # print predicted NER spans
+    print("The following NER tags are found:")
+    # iterate over entities and print
+    for entity in sentence.get_spans("ner"):
+        print(entity)

index.md CHANGED Viewed

@@ -5,22 +5,32 @@ The app is based on the [streamlit](https://streamlit.io/) package.
 A live version can be found here: https://huggingface.co/spaces/presidio/presidio_demo
 ## Requirements
-1. Clone the repo and move to the `docs/samples/python/streamlit ` folder
-1. Install dependencies (preferably in a virtual environment)
 ```sh
 pip install -r requirements
 ```
 > Note: This would install additional packages such as `transformers` and `flair` which are not mandatory for using Presidio.
-2.
 3. *Optional*: Update the `analyzer_engine` and `anonymizer_engine` functions for your specific implementation (in `presidio_helpers.py`).
-3. Start the app:
 ```sh
 streamlit run presidio_streamlit.py
 ```
 ## Output
 Output should be similar to this screenshot:
-![image](https://user-images.githubusercontent.com/3776619/232289541-d59992e1-52a4-44c1-b904-b22c72c02a5b.png)

 A live version can be found here: https://huggingface.co/spaces/presidio/presidio_demo
 ## Requirements
+1. Clone the repo and move to the `docs/samples/python/streamlit` folder
+2. Install dependencies (preferably in a virtual environment)
 ```sh
 pip install -r requirements
 ```
 > Note: This would install additional packages such as `transformers` and `flair` which are not mandatory for using Presidio.
 3. *Optional*: Update the `analyzer_engine` and `anonymizer_engine` functions for your specific implementation (in `presidio_helpers.py`).
+4. Start the app:
 ```sh
 streamlit run presidio_streamlit.py
 ```
+5. Consider adding a `.env` file with the following environment variables to customize the app further:
+```sh
+TA_KEY=YOUR_TEXT_ANALYTICS_KEY
+TA_ENDPOINT=YOUR_TEXT_ANALYTICS_ENDPOINT
+OPENAI_TYPE="Azure" #or "openai"
+OPENAI_KEY=YOUR_OPENAI_KEY
+OPENAI_API_VERSION="2023-05-15"
+AZURE_OPENAI_ENDPOINT=YOUR_AZURE_OPENAI_ENDPOINT
+AZURE_OPENAI_DEPLOYMENT=text-davinci-003
+ALLOW_OTHER_MODELS=true #true if the user could download new models
+```
 ## Output
 Output should be similar to this screenshot:
+![image](https://github.com/microsoft/presidio/assets/3776619/7d0eadf1-e750-4747-8b59-8203aa43cac8)

openai_fake_data_generator.py CHANGED Viewed

@@ -39,7 +39,10 @@ def call_completion_model(
     """
     if deployment_id:
         response = openai.Completion.create(
-            deployment_id=deployment_id, model=model, prompt=prompt, max_tokens=max_tokens
         )
     else:
         response = openai.Completion.create(
@@ -64,17 +67,18 @@ def create_prompt(anonymized_text: str) -> str:
     a. Use completely random numbers, so every digit is drawn between 0 and 9.
     b. Use realistic names that come from diverse genders, ethnicities and countries.
-    c. If there are no placeholders, return the text as is and provide an answer.
     d. Keep the formatting as close to the original as possible.
     e. If PII exists in the input, replace it with fake values in the output.
-    input: How do I change the limit on my credit card {{credit_card_number}}?
     output: How do I change the limit on my credit card 2539 3519 2345 1555?
-    input: <PERSON> was the chief science officer at <ORGANIZATION>.
     output: Katherine Buckjov was the chief science officer at NASA.
-    input: Cameroon lives in <LOCATION>.
     output: Vladimir lives in Moscow.
-    input: {anonymized_text}
-    output:
-    """
     return prompt

     """
     if deployment_id:
         response = openai.Completion.create(
+            deployment_id=deployment_id,
+            model=model,
+            prompt=prompt,
+            max_tokens=max_tokens,
         )
     else:
         response = openai.Completion.create(
     a. Use completely random numbers, so every digit is drawn between 0 and 9.
     b. Use realistic names that come from diverse genders, ethnicities and countries.
+    c. If there are no placeholders, return the text as is.
     d. Keep the formatting as close to the original as possible.
     e. If PII exists in the input, replace it with fake values in the output.
+    f. Remove whitespace before and after the generated text
+    input: [[TEXT STARTS]] How do I change the limit on my credit card {{credit_card_number}}?[[TEXT ENDS]]
     output: How do I change the limit on my credit card 2539 3519 2345 1555?
+    input: [[TEXT STARTS]]<PERSON> was the chief science officer at <ORGANIZATION>.[[TEXT ENDS]]
     output: Katherine Buckjov was the chief science officer at NASA.
+    input: [[TEXT STARTS]]Cameroon lives in <LOCATION>.[[TEXT ENDS]]
     output: Vladimir lives in Moscow.
+    input: [[TEXT STARTS]]{anonymized_text}[[TEXT ENDS]]
+    output:"""
     return prompt

presidio_helpers.py CHANGED Viewed

@@ -25,7 +25,8 @@ from presidio_nlp_engine_config import (
     create_nlp_engine_with_spacy,
     create_nlp_engine_with_flair,
     create_nlp_engine_with_transformers,
-    create_nlp_engine_with_azure_text_analytics,
 )
 logger = logging.getLogger("presidio-streamlit")
@@ -49,14 +50,16 @@ def nlp_engine_and_registry(
     """
     # Set up NLP Engine according to the model of choice
-    if "spaCy" in model_family:
         return create_nlp_engine_with_spacy(model_path)
-    elif "flair" in model_family:
         return create_nlp_engine_with_flair(model_path)
-    elif "HuggingFace" in model_family:
         return create_nlp_engine_with_transformers(model_path)
-    elif "Azure Text Analytics" in model_family:
-        return create_nlp_engine_with_azure_text_analytics(ta_key, ta_endpoint)
     else:
         raise ValueError(f"Model family {model_family} not supported")

     create_nlp_engine_with_spacy,
     create_nlp_engine_with_flair,
     create_nlp_engine_with_transformers,
+    create_nlp_engine_with_azure_ai_language,
+    create_nlp_engine_with_stanza,
 )
 logger = logging.getLogger("presidio-streamlit")
     """
     # Set up NLP Engine according to the model of choice
+    if "spacy" in model_family.lower():
         return create_nlp_engine_with_spacy(model_path)
+    if "stanza" in model_family.lower():
+        return create_nlp_engine_with_stanza(model_path)
+    elif "flair" in model_family.lower():
         return create_nlp_engine_with_flair(model_path)
+    elif "huggingface" in model_family.lower():
         return create_nlp_engine_with_transformers(model_path)
+    elif "azure ai language" in model_family.lower():
+        return create_nlp_engine_with_azure_ai_language(ta_key, ta_endpoint)
     else:
         raise ValueError(f"Model family {model_family} not supported")

presidio_nlp_engine_config.py CHANGED Viewed

@@ -1,8 +1,12 @@
-from typing import Tuple
 import logging
 import spacy
 from presidio_analyzer import RecognizerRegistry
-from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
 logger = logging.getLogger("presidio-streamlit")
@@ -12,21 +16,70 @@ def create_nlp_engine_with_spacy(
 ) -> Tuple[NlpEngine, RecognizerRegistry]:
     """
     Instantiate an NlpEngine with a spaCy model
-    :param model_path: spaCy model path.
     """
     registry = RecognizerRegistry()
-    registry.load_predefined_recognizers()
-    if not spacy.util.is_package(model_path):
-        spacy.cli.download(model_path)
     nlp_configuration = {
-        "nlp_engine_name": "spacy",
         "models": [{"lang_code": "en", "model_name": model_path}],
     }
     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
     return nlp_engine, registry
@@ -39,41 +92,62 @@ def create_nlp_engine_with_transformers(
     would return NlpArtifacts such as POS and lemmas.
     :param model_path: HuggingFace model path.
     """
-    from transformers_rec import (
-        STANFORD_COFIGURATION,
-        BERT_DEID_CONFIGURATION,
-        TransformersRecognizer,
-    )
-    registry = RecognizerRegistry()
-    registry.load_predefined_recognizers()
-    if not spacy.util.is_package("en_core_web_sm"):
-        spacy.cli.download("en_core_web_sm")
-    # Using a small spaCy model + a HF NER model
-    transformers_recognizer = TransformersRecognizer(model_path=model_path)
-    if model_path == "StanfordAIMI/stanford-deidentifier-base":
-        transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
-    elif model_path == "obi/deid_roberta_i2b2":
-        transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
-    else:
-        print(f"Warning: Model has no configuration, loading default.")
-        transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
-    # Use small spaCy model, no need for both spacy and HF models
-    # The transformers model is used here as a recognizer, not as an NlpEngine
     nlp_configuration = {
-        "nlp_engine_name": "spacy",
-        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
     }
-    registry.add_recognizer(transformers_recognizer)
-    registry.remove_recognizer("SpacyRecognizer")
     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
     return nlp_engine, registry
@@ -91,6 +165,8 @@ def create_nlp_engine_with_flair(
     registry = RecognizerRegistry()
     registry.load_predefined_recognizers()
     if not spacy.util.is_package("en_core_web_sm"):
         spacy.cli.download("en_core_web_sm")
     # Using a small spaCy model + a Flair NER model
@@ -107,7 +183,7 @@ def create_nlp_engine_with_flair(
     return nlp_engine, registry
-def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
     """
     Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
     The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
@@ -115,7 +191,7 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
     :param ta_key: Azure Text Analytics key.
     :param ta_endpoint: Azure Text Analytics endpoint.
     """
-    from text_analytics_wrapper import TextAnalyticsWrapper
     if not ta_key or not ta_endpoint:
         raise RuntimeError("Please fill in the Text Analytics endpoint details")
@@ -123,7 +199,9 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
     registry = RecognizerRegistry()
     registry.load_predefined_recognizers()
-    ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)
     nlp_configuration = {
         "nlp_engine_name": "spacy",
         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
@@ -131,7 +209,7 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
-    registry.add_recognizer(ta_recognizer)
     registry.remove_recognizer("SpacyRecognizer")
     return nlp_engine, registry

 import logging
+from typing import Tuple
 import spacy
 from presidio_analyzer import RecognizerRegistry
+from presidio_analyzer.nlp_engine import (
+    NlpEngine,
+    NlpEngineProvider,
+)
 logger = logging.getLogger("presidio-streamlit")
 ) -> Tuple[NlpEngine, RecognizerRegistry]:
     """
     Instantiate an NlpEngine with a spaCy model
+    :param model_path: path to model / model name.
     """
+    nlp_configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": "en", "model_name": model_path}],
+        "ner_model_configuration": {
+            "model_to_presidio_entity_mapping": {
+                "PER": "PERSON",
+                "PERSON": "PERSON",
+                "NORP": "NRP",
+                "FAC": "FACILITY",
+                "LOC": "LOCATION",
+                "GPE": "LOCATION",
+                "LOCATION": "LOCATION",
+                "ORG": "ORGANIZATION",
+                "ORGANIZATION": "ORGANIZATION",
+                "DATE": "DATE_TIME",
+                "TIME": "DATE_TIME",
+            },
+            "low_confidence_score_multiplier": 0.4,
+            "low_score_entity_names": ["ORG", "ORGANIZATION"],
+        },
+    }
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
     registry = RecognizerRegistry()
+    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
+    return nlp_engine, registry
+def create_nlp_engine_with_stanza(
+    model_path: str,
+) -> Tuple[NlpEngine, RecognizerRegistry]:
+    """
+    Instantiate an NlpEngine with a stanza model
+    :param model_path: path to model / model name.
+    :return: the created NlpEngine and a RecognizerRegistry loaded with
+    the predefined recognizers for that engine.
+    """
+    # Configuration handed to NlpEngineProvider; only the "en" language is configured.
     nlp_configuration = {
+        "nlp_engine_name": "stanza",
         "models": [{"lang_code": "en", "model_name": model_path}],
+        "ner_model_configuration": {
+            # Per the key name, maps NER labels emitted by the model to Presidio entity names.
+            "model_to_presidio_entity_mapping": {
+                "PER": "PERSON",
+                "PERSON": "PERSON",
+                "NORP": "NRP",
+                "FAC": "FACILITY",
+                "LOC": "LOCATION",
+                "GPE": "LOCATION",
+                "LOCATION": "LOCATION",
+                "ORG": "ORGANIZATION",
+                "ORGANIZATION": "ORGANIZATION",
+                "DATE": "DATE_TIME",
+                "TIME": "DATE_TIME",
+            }
+        },
     }
     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    # The registry is created after the engine so the engine can be passed to the loader.
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
     return nlp_engine, registry
     would return NlpArtifacts such as POS and lemmas.
     :param model_path: HuggingFace model path.
     """
+    print(f"Loading Transformers model: {model_path} of type {type(model_path)}")
     nlp_configuration = {
+        "nlp_engine_name": "transformers",
+        "models": [
+            {
+                "lang_code": "en",
+                "model_name": {"spacy": "en_core_web_sm", "transformers": model_path},
+            }
+        ],
+        "ner_model_configuration": {
+            "model_to_presidio_entity_mapping": {
+                "PER": "PERSON",
+                "PERSON": "PERSON",
+                "LOC": "LOCATION",
+                "LOCATION": "LOCATION",
+                "GPE": "LOCATION",
+                "ORG": "ORGANIZATION",
+                "ORGANIZATION": "ORGANIZATION",
+                "NORP": "NRP",
+                "AGE": "AGE",
+                "ID": "ID",
+                "EMAIL": "EMAIL",
+                "PATIENT": "PERSON",
+                "STAFF": "PERSON",
+                "HOSP": "ORGANIZATION",
+                "PATORG": "ORGANIZATION",
+                "DATE": "DATE_TIME",
+                "TIME": "DATE_TIME",
+                "PHONE": "PHONE_NUMBER",
+                "HCW": "PERSON",
+                "HOSPITAL": "ORGANIZATION",
+                "FACILITY": "LOCATION",
+            },
+            "low_confidence_score_multiplier": 0.4,
+            "low_score_entity_names": ["ID"],
+            "labels_to_ignore": [
+                "CARDINAL",
+                "EVENT",
+                "LANGUAGE",
+                "LAW",
+                "MONEY",
+                "ORDINAL",
+                "PERCENT",
+                "PRODUCT",
+                "QUANTITY",
+                "WORK_OF_ART",
+            ],
+        },
     }
     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
     return nlp_engine, registry
     registry = RecognizerRegistry()
     registry.load_predefined_recognizers()
+    # there is no official Flair NlpEngine, hence we load it as an additional recognizer
     if not spacy.util.is_package("en_core_web_sm"):
         spacy.cli.download("en_core_web_sm")
     # Using a small spaCy model + a Flair NER model
     return nlp_engine, registry
+def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
     """
     Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
     The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
     :param ta_key: Azure Text Analytics key.
     :param ta_endpoint: Azure Text Analytics endpoint.
     """
+    from azure_ai_language_wrapper import AzureAIServiceWrapper
     if not ta_key or not ta_endpoint:
         raise RuntimeError("Please fill in the Text Analytics endpoint details")
     registry = RecognizerRegistry()
     registry.load_predefined_recognizers()
+    azure_ai_language_recognizer = AzureAIServiceWrapper(
+        ta_endpoint=ta_endpoint, ta_key=ta_key
+    )
     nlp_configuration = {
         "nlp_engine_name": "spacy",
         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    registry.add_recognizer(azure_ai_language_recognizer)
     registry.remove_recognizer("SpacyRecognizer")
     return nlp_engine, registry

presidio_streamlit.py CHANGED Viewed

@@ -56,7 +56,8 @@ model_list = [
     "flair/ner-english-large",
     "HuggingFace/obi/deid_roberta_i2b2",
     "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
-    "Azure Text Analytics PII",
     "Other",
 ]
 if not allow_other_models:
@@ -75,22 +76,22 @@ st_model_package = st_model.split("/")[0]
 # Remove package prefix (if needed)
 st_model = (
     st_model
-    if st_model_package not in ("spaCy", "HuggingFace")
     else "/".join(st_model.split("/")[1:])
 )
 if st_model == "Other":
     st_model_package = st.sidebar.selectbox(
-        "NER model OSS package", options=["spaCy", "Flair", "HuggingFace"]
     )
     st_model = st.sidebar.text_input(f"NER model name", value="")
-if st_model == "Azure Text Analytics PII":
     st_ta_key = st.sidebar.text_input(
-        f"Text Analytics key", value=os.getenv("TA_KEY", ""), type="password"
     )
     st_ta_endpoint = st.sidebar.text_input(
-        f"Text Analytics endpoint",
         value=os.getenv("TA_ENDPOINT", default=""),
         help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview",  # noqa: E501
     )
@@ -124,16 +125,10 @@ open_ai_params = None
 logger.debug(f"st_operator: {st_operator}")
-if st_operator == "mask":
-    st_number_of_chars = st.sidebar.number_input(
-        "number of chars", value=st_number_of_chars, min_value=0, max_value=100
-    )
-    st_mask_char = st.sidebar.text_input(
-        "Mask character", value=st_mask_char, max_chars=1
-    )
-elif st_operator == "encrypt":
-    st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
-elif st_operator == "synthesize":
     if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
         openai_api_type = "azure"
         st_openai_api_base = st.sidebar.text_input(
@@ -161,6 +156,34 @@ elif st_operator == "synthesize":
         value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
         help="See more here: https://platform.openai.com/docs/models/",
     )
     open_ai_params = OpenAIParams(
         openai_key=st_openai_key,
@@ -214,7 +237,8 @@ with st.expander("About this demo", expanded=False):
         \n\n[Code](https://aka.ms/presidio) |
         [Tutorial](https://microsoft.github.io/presidio/tutorial/) |
         [Installation](https://microsoft.github.io/presidio/installation/) |
-        [FAQ](https://microsoft.github.io/presidio/faq/) |"""
     )
     st.info(

     "flair/ner-english-large",
     "HuggingFace/obi/deid_roberta_i2b2",
     "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
+    "stanza/en",
+    "Azure AI Language",
     "Other",
 ]
 if not allow_other_models:
 # Remove package prefix (if needed)
 st_model = (
     st_model
+    if st_model_package.lower() not in ("spacy", "stanza", "huggingface")
     else "/".join(st_model.split("/")[1:])
 )
 if st_model == "Other":
     st_model_package = st.sidebar.selectbox(
+        "NER model OSS package", options=["spaCy", "stanza", "Flair", "HuggingFace"]
     )
     st_model = st.sidebar.text_input(f"NER model name", value="")
+if st_model == "Azure AI Language":
     st_ta_key = st.sidebar.text_input(
+        f"Azure AI Language key", value=os.getenv("TA_KEY", ""), type="password"
     )
     st_ta_endpoint = st.sidebar.text_input(
+        f"Azure AI Language endpoint",
         value=os.getenv("TA_ENDPOINT", default=""),
         help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview",  # noqa: E501
     )
 logger.debug(f"st_operator: {st_operator}")
+def set_up_openai_synthesis():
+    """Set up the OpenAI API key and model for text synthesis."""
     if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
         openai_api_type = "azure"
         st_openai_api_base = st.sidebar.text_input(
         value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
         help="See more here: https://platform.openai.com/docs/models/",
     )
+    return (
+        openai_api_type,
+        st_openai_api_base,
+        st_deployment_name,
+        st_openai_version,
+        st_openai_key,
+        st_openai_model,
+    )
+if st_operator == "mask":
+    st_number_of_chars = st.sidebar.number_input(
+        "number of chars", value=st_number_of_chars, min_value=0, max_value=100
+    )
+    st_mask_char = st.sidebar.text_input(
+        "Mask character", value=st_mask_char, max_chars=1
+    )
+elif st_operator == "encrypt":
+    st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
+elif st_operator == "synthesize":
+    (
+        openai_api_type,
+        st_openai_api_base,
+        st_deployment_name,
+        st_openai_version,
+        st_openai_key,
+        st_openai_model,
+    ) = set_up_openai_synthesis()
     open_ai_params = OpenAIParams(
         openai_key=st_openai_key,
         \n\n[Code](https://aka.ms/presidio) |
         [Tutorial](https://microsoft.github.io/presidio/tutorial/) |
         [Installation](https://microsoft.github.io/presidio/installation/) |
+        [FAQ](https://microsoft.github.io/presidio/faq/) |
+        [Feedback](https://forms.office.com/r/9ufyYjfDaY) |"""
     )
     st.info(

test_streamlit.py ADDED Viewed

	@@ -0,0 +1,43 @@

+from presidio_helpers import analyzer_engine, analyze, anonymize
+def test_streamlit_logic():
+    st_model = "en"  # st_model = "StanfordAIMI/stanford-deidentifier-base"
+    st_model_package = "stanza"  ##st_model_package = "HuggingFace"
+    st_ta_key = None
+    st_ta_endpoint = None
+    analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)
+    # Read default text
+    with open("demo_text.txt") as f:
+        demo_text = f.readlines()
+    st_text = "".join(demo_text)
+    # instantiate and cache AnalyzerEngine
+    analyzer_engine(*analyzer_params)
+    # Analyze
+    st_analyze_results = analyze(
+        *analyzer_params,
+        text=st_text,
+        entities="All",
+        language="en",
+        score_threshold=0.35,
+        return_decision_process=True,
+        allow_list=[],
+        deny_list=[],
+    )
+    # Anonymize
+    st_anonymize_results = anonymize(
+        text=st_text,
+        operator="replace",
+        mask_char=None,
+        number_of_chars=None,
+        encrypt_key=None,
+        analyze_results=st_analyze_results,
+    )
+    assert st_anonymize_results.text != ""