from typing import Tuple
import logging

import spacy
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider

logger = logging.getLogger("presidio-streamlit")


def create_nlp_engine_with_spacy(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Instantiate an NlpEngine with a spaCy model.
    :param model_path: spaCy model path.
    """
    if not spacy.util.is_package(model_path):
        spacy.cli.download(model_path)

    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [
            {"lang_code": model_path.split("_")[0], "model_name": model_path}
        ],
    }

    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(nlp_engine=nlp_engine, languages=["fr", "en"])
    registry.add_recognizers_from_yaml("recognizers.yaml")

    return nlp_engine, registry


# def create_nlp_engine_with_transformers(
#     model_path: str,
# ) -> Tuple[NlpEngine, RecognizerRegistry]:
#     """
#     Instantiate an NlpEngine with a TransformersRecognizer and a small spaCy model.
#     The TransformersRecognizer would return results from the transformers model,
#     while the spaCy model would return NlpArtifacts such as POS tags and lemmas.
#     :param model_path: HuggingFace model path.
#     """
#
#     from transformers_rec import (
#         STANFORD_COFIGURATION,
#         BERT_DEID_CONFIGURATION,
#         TransformersRecognizer,
#     )
#
#     registry = RecognizerRegistry()
#     registry.load_predefined_recognizers()
#
#     if not spacy.util.is_package("en_core_web_sm"):
#         spacy.cli.download("en_core_web_sm")
#
#     # Using a small spaCy model + a HF NER model
#     transformers_recognizer = TransformersRecognizer(model_path=model_path)
#
#     if model_path == "StanfordAIMI/stanford-deidentifier-base":
#         transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
#     elif model_path == "obi/deid_roberta_i2b2":
#         transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
#     else:
#         logger.warning("Model has no configuration, loading default.")
#         transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
#
#     # Use the small spaCy model; no need for both a spaCy and an HF model.
#     # The transformers model is used here as a recognizer, not as an NlpEngine.
#     nlp_configuration = {
#         "nlp_engine_name": "spacy",
#         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
#     }
#
#     registry.add_recognizer(transformers_recognizer)
#     registry.remove_recognizer("SpacyRecognizer")
#
#     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
#
#     return nlp_engine, registry


# def create_nlp_engine_with_flair(
#     model_path: str,
# ) -> Tuple[NlpEngine, RecognizerRegistry]:
#     """
#     Instantiate an NlpEngine with a FlairRecognizer and a small spaCy model.
#     The FlairRecognizer would return results from the Flair model, while the
#     spaCy model would return NlpArtifacts such as POS tags and lemmas.
#     :param model_path: Flair model path.
#     """
#     from flair_recognizer import FlairRecognizer
#
#     registry = RecognizerRegistry()
#     registry.load_predefined_recognizers()
#
#     if not spacy.util.is_package("en_core_web_sm"):
#         spacy.cli.download("en_core_web_sm")
#
#     # Using a small spaCy model + a Flair NER model
#     flair_recognizer = FlairRecognizer(model_path=model_path)
#     nlp_configuration = {
#         "nlp_engine_name": "spacy",
#         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
#     }
#     registry.add_recognizer(flair_recognizer)
#     registry.remove_recognizer("SpacyRecognizer")
#
#     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
#
#     return nlp_engine, registry


# def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
#     """
#     Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
#     The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII,
#     while the spaCy model would return NlpArtifacts such as POS tags and lemmas.
#     :param ta_key: Azure Text Analytics key.
#     :param ta_endpoint: Azure Text Analytics endpoint.
#     """
#     from text_analytics_wrapper import TextAnalyticsWrapper
#
#     if not ta_key or not ta_endpoint:
#         raise RuntimeError("Please fill in the Text Analytics endpoint details")
#
#     registry = RecognizerRegistry()
#     registry.load_predefined_recognizers()
#
#     ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)
#     nlp_configuration = {
#         "nlp_engine_name": "spacy",
#         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
#     }
#
#     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
#
#     registry.add_recognizer(ta_recognizer)
#     registry.remove_recognizer("SpacyRecognizer")
#
#     return nlp_engine, registry