Anonymizer_demo / presidio_nlp_engine_config.py
Farnazgh's picture
add new transformers model for french + update entities
79d722e
from typing import Tuple
import logging
import spacy
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
from transformers_class import TransformerRecognizer
logger = logging.getLogger("presidio-streamlit")
def create_nlp_engine_with_spacy(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a spaCy-backed NlpEngine together with a RecognizerRegistry.

    The spaCy model is downloaded through the spaCy CLI when it is not
    already installed as a package. The registry is filled with Presidio's
    predefined recognizers for "fr" and "en", then with the recognizers
    declared in ``recognizers.yaml``.

    :param model_path: spaCy model package name. The text before the first
        underscore is used as the language code of the model.
    :return: a ``(nlp_engine, registry)`` tuple.
    """
    model_installed = spacy.util.is_package(model_path)
    if not model_installed:
        spacy.cli.download(model_path)

    # The language code is whatever precedes the first underscore in the name.
    language_code, _, _ = model_path.partition("_")
    model_entry = {"lang_code": language_code, "model_name": model_path}
    provider = NlpEngineProvider(
        nlp_configuration={"nlp_engine_name": "spacy", "models": [model_entry]}
    )
    engine = provider.create_engine()

    recognizer_registry = RecognizerRegistry()
    recognizer_registry.load_predefined_recognizers(
        nlp_engine=engine, languages=["fr", "en"]
    )
    recognizer_registry.add_recognizers_from_yaml("recognizers.yaml")
    return engine, recognizer_registry
def create_nlp_engine_with_transformers(
    model_path: str,
    transformers_model_name: str = "AliaeAI/camembert_anonymizer_production_v2",
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a spaCy NlpEngine plus a registry whose NER comes from a HuggingFace model.

    The spaCy model only backs the NlpEngine; the transformers model is
    plugged in as a recognizer, not as an NlpEngine. The registry receives
    the pattern recognizers registered by ``load_predefined_recognizers``
    (called with its default language) and one ``TransformerRecognizer``.

    :param model_path: spaCy model package name (NOT a HuggingFace path).
        It is downloaded through the spaCy CLI when not installed, and the
        text before the first underscore is used as its language code.
    :param transformers_model_name: HuggingFace model identifier handed to
        ``TransformerRecognizer``. Alternatives previously noted for this
        slot: "Jean-Baptiste/camembert-ner" and
        "AliaeAI/camembert_anonymizer_production".
    :return: a ``(nlp_engine, registry)`` tuple.
    """
    if not spacy.util.is_package(model_path):
        spacy.cli.download(model_path)

    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [
            {"lang_code": model_path.split("_")[0], "model_name": model_path}
        ],
    }
    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    registry = RecognizerRegistry()
    registry = load_predefined_recognizers(registry)

    # Translate the model's entity labels to the entity names used here.
    mapping_labels = {"PER": "PERSON", "LOC": "LOCATION"}
    transformers_recognizer = TransformerRecognizer(
        transformers_model_name, mapping_labels
    )
    registry.add_recognizer(transformers_recognizer)

    # NOTE(review): presumably meant to keep spaCy's own NER from competing
    # with the transformers results; nothing visible in this function
    # registers a "SpacyRecognizer", so confirm whether this call is needed.
    registry.remove_recognizer("SpacyRecognizer")
    return nlp_engine, registry
from presidio_analyzer.predefined_recognizers import PhoneRecognizer, EmailRecognizer, CreditCardRecognizer, CryptoRecognizer, DateRecognizer, IpRecognizer, IbanRecognizer, UrlRecognizer
import phonenumbers
def load_predefined_recognizers(registry, lang='fr'):
    """
    Add a fixed set of Presidio recognizers, then the YAML-defined ones, to ``registry``.

    Each recognizer is created for ``lang`` with the French context words
    listed below and registered in this order: phone, email, credit card,
    crypto, date, IP, IBAN, URL. The recognizers declared in
    ``recognizers.yaml`` are added last.

    :param registry: registry object that receives the recognizers.
    :param lang: language code passed as ``supported_language`` to every
        recognizer built here.
    :return: the same ``registry`` object that was passed in.
    """
    # (recognizer class, keyword arguments besides supported_language)
    recognizer_specs = (
        (
            PhoneRecognizer,
            {
                "supported_regions": phonenumbers.SUPPORTED_REGIONS,
                "context": ['téléphone'],
            },
        ),
        (EmailRecognizer, {"context": ["email", "mail", "e-mail"]}),
        (CreditCardRecognizer, {"context": ["crédit", "carte", "carte de crédit"]}),
        (CryptoRecognizer, {"context": ["crypto"]}),
        (DateRecognizer, {"context": ["mois", "date", "jour", "année"]}),
        (IpRecognizer, {"context": ["IP", "ip"]}),
        (IbanRecognizer, {"context": ["IBAN", "iban", "bancaire", "compte"]}),
        (UrlRecognizer, {"context": ["site", "web"]}),
    )

    # Build and register one recognizer at a time, in table order.
    for recognizer_cls, extra_kwargs in recognizer_specs:
        registry.add_recognizer(
            recognizer_cls(supported_language=lang, **extra_kwargs)
        )

    registry.add_recognizers_from_yaml("recognizers.yaml")
    return registry