File size: 4,833 Bytes
28a039d 7172378 28a039d 7172378 28a039d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
from typing import Tuple
import logging
import spacy
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
logger = logging.getLogger("presidio-streamlit")
def create_nlp_engine_with_spacy(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a spaCy-backed NlpEngine and a registry of predefined recognizers.

    When ``spacy.util.is_package`` does not report the model as installed,
    it is fetched with ``spacy.cli.download`` before the engine is created.

    :param model_path: spaCy model path.
    :return: The NlpEngine and the RecognizerRegistry, in that order.
    """
    recognizer_registry = RecognizerRegistry()
    recognizer_registry.load_predefined_recognizers()

    model_installed = spacy.util.is_package(model_path)
    if not model_installed:
        spacy.cli.download(model_path)

    # Single English model; its name is whatever the caller passed in.
    english_model = {"lang_code": "en", "model_name": model_path}
    engine_provider = NlpEngineProvider(
        nlp_configuration={"nlp_engine_name": "spacy", "models": [english_model]}
    )

    return engine_provider.create_engine(), recognizer_registry
def create_nlp_engine_with_transformers(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Instantiate an NlpEngine with a TransformersRecognizer and a small spaCy model.

    The TransformersRecognizer would return results from Transformers models,
    the spaCy model would return NlpArtifacts such as POS and lemmas.

    :param model_path: HuggingFace model path. Two paths have a dedicated
        configuration; any other path is loaded with the BERT de-id
        configuration after a warning is logged.
    :return: The NlpEngine and the RecognizerRegistry, in that order.
    """
    # Function-scope import: transformers_rec is only needed on this code path.
    from transformers_rec import (
        STANFORD_COFIGURATION,
        BERT_DEID_CONFIGURATION,
        TransformersRecognizer,
    )

    registry = RecognizerRegistry()
    registry.load_predefined_recognizers()

    if not spacy.util.is_package("en_core_web_sm"):
        spacy.cli.download("en_core_web_sm")

    # Using a small spaCy model + a HF NER model
    transformers_recognizer = TransformersRecognizer(model_path=model_path)

    # Model path -> keyword arguments for load_transformer().
    known_configurations = {
        "StanfordAIMI/stanford-deidentifier-base": STANFORD_COFIGURATION,
        "obi/deid_roberta_i2b2": BERT_DEID_CONFIGURATION,
    }
    configuration = known_configurations.get(model_path)
    if configuration is None:
        # Report through the module logger (not stdout) and name the model
        # so the fallback can be traced back to the offending path.
        logger.warning(
            "Model %s has no configuration, loading default.", model_path
        )
        configuration = BERT_DEID_CONFIGURATION
    transformers_recognizer.load_transformer(**configuration)

    # Use small spaCy model, no need for both spacy and HF models
    # The transformers model is used here as a recognizer, not as an NlpEngine
    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
    }

    # The transformers recognizer takes the place of the spaCy one.
    registry.add_recognizer(transformers_recognizer)
    registry.remove_recognizer("SpacyRecognizer")

    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    return nlp_engine, registry
def create_nlp_engine_with_flair(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a spaCy NlpEngine and a registry that uses a FlairRecognizer.

    The FlairRecognizer returns results from Flair models, while the small
    spaCy model returns NlpArtifacts such as POS and lemmas. The registry's
    "SpacyRecognizer" is removed once the FlairRecognizer has been added.

    :param model_path: Flair model path.
    :return: The NlpEngine and the RecognizerRegistry, in that order.
    """
    from flair_recognizer import FlairRecognizer

    spacy_model_name = "en_core_web_sm"

    recognizer_registry = RecognizerRegistry()
    recognizer_registry.load_predefined_recognizers()

    # Fetch the small spaCy model when it is not reported as installed.
    if not spacy.util.is_package(spacy_model_name):
        spacy.cli.download(spacy_model_name)

    # Small spaCy model for the engine, Flair model for NER.
    recognizer_registry.add_recognizer(FlairRecognizer(model_path=model_path))
    recognizer_registry.remove_recognizer("SpacyRecognizer")

    engine_provider = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": spacy_model_name}],
        }
    )

    return engine_provider.create_engine(), recognizer_registry
def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
    """
    Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.

    The TextAnalyticsWrapper would return results from calling Azure Text
    Analytics PII, the spaCy model would return NlpArtifacts such as POS and
    lemmas.

    :param ta_key: Azure Text Analytics key.
    :param ta_endpoint: Azure Text Analytics endpoint.
    :return: The NlpEngine and the RecognizerRegistry, in that order.
    :raises RuntimeError: If ``ta_key`` or ``ta_endpoint`` is empty.
    """
    # Validate before importing the wrapper so that missing credentials are
    # always reported with this message, even if the wrapper module cannot
    # be imported.
    if not ta_key or not ta_endpoint:
        raise RuntimeError("Please fill in the Text Analytics endpoint details")

    from text_analytics_wrapper import TextAnalyticsWrapper

    registry = RecognizerRegistry()
    registry.load_predefined_recognizers()

    # Same guard as the transformers and flair factories: make sure the small
    # spaCy model used by the engine below is available.
    if not spacy.util.is_package("en_core_web_sm"):
        spacy.cli.download("en_core_web_sm")

    ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)

    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
    }

    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    # The Text Analytics recognizer takes the place of the spaCy one.
    registry.add_recognizer(ta_recognizer)
    registry.remove_recognizer("SpacyRecognizer")

    return nlp_engine, registry
|