Anonymizer_demo / transformers_class.py
Farnazgh's picture
add new transformers model for french + update entities
79d722e
from transformers import pipeline
from presidio_analyzer import (
RecognizerResult,
EntityRecognizer,
AnalysisExplanation,
)
from presidio_analyzer.nlp_engine import NlpEngine, SpacyNlpEngine, NlpArtifacts
class TransformerRecognizer(EntityRecognizer):
    """Presidio recognizer backed by a Transformers token-classification pipeline.

    Entity groups predicted by the model are translated to Presidio entity
    names through ``mapping_labels``; groups without a mapping are dropped.
    """

    def __init__(
        self,
        model_id_or_path,
        mapping_labels,
        aggregation_strategy="simple",
        supported_language="fr",
        ignore_labels=("O", "MISC"),
    ):
        """Build the pipeline and register the supported entities.

        Args:
            model_id_or_path: Model identifier or local path handed to
                ``transformers.pipeline`` as ``model``.
            mapping_labels: Dict mapping model entity groups to Presidio
                entity names.
            aggregation_strategy: Forwarded to the pipeline unchanged.
            supported_language: Language code passed to the parent recognizer.
            ignore_labels: Labels the pipeline should discard. The default is
                an immutable tuple so instances never share one mutable list;
                an explicit ``None`` is forwarded as-is.
        """
        # Give the pipeline its own list so later mutation on either side
        # cannot leak between recognizer instances.
        pipeline_ignore_labels = (
            list(ignore_labels) if ignore_labels is not None else None
        )

        # Initialise the transformers pipeline for the given model or path.
        self.pipeline = pipeline(
            "token-classification",
            model=model_id_or_path,
            aggregation_strategy=aggregation_strategy,
            ignore_labels=pipeline_ignore_labels,
        )

        # Map model labels to Presidio labels.
        self.label2presidio = mapping_labels

        # Advertise the mapped Presidio entities to the parent class.
        super().__init__(
            supported_entities=list(self.label2presidio.values()),
            supported_language=supported_language,
        )

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(
        self, text: str, entities=None, nlp_artifacts: NlpArtifacts = None
    ):
        """Extract entities from ``text`` using the Transformers pipeline.

        Args:
            text: Text to analyse.
            entities: Presidio entity names to keep. ``None`` disables the
                filter and returns every mapped entity.
            nlp_artifacts: Accepted for interface compatibility; not used.

        Returns:
            A list of ``RecognizerResult`` objects, one per predicted entity
            whose group has a mapping and passes the ``entities`` filter.
        """
        results = []
        for prediction in self.pipeline(text):
            converted_entity = self.label2presidio.get(prediction["entity_group"])
            if converted_entity is None:
                # The model produced a group with no Presidio mapping.
                continue
            # Test for None *before* the membership check: ``x in None``
            # raises TypeError, which broke the default ``entities=None``.
            if entities is not None and converted_entity not in entities:
                continue
            results.append(
                RecognizerResult(
                    entity_type=converted_entity,
                    start=prediction["start"],
                    end=prediction["end"],
                    score=prediction["score"],
                )
            )
        return results