from typing import Dict, List, Optional

from transformers import pipeline

from presidio_analyzer import (
    RecognizerResult,
    EntityRecognizer,
    AnalysisExplanation,
)
from presidio_analyzer.nlp_engine import NlpEngine, SpacyNlpEngine, NlpArtifacts


class TransformerRecognizer(EntityRecognizer):
    """Presidio recognizer backed by a Transformers token-classification pipeline.

    Labels predicted by the model are translated to Presidio entity names
    through a caller-supplied mapping; predictions whose label is not in the
    mapping are dropped.
    """

    def __init__(
        self,
        model_id_or_path,
        mapping_labels: Dict[str, str],
        aggregation_strategy="simple",
        supported_language="fr",
        ignore_labels=("O", "MISC"),
    ):
        """Build the pipeline and register the supported entities.

        Args:
            model_id_or_path: Passed as ``model`` to ``transformers.pipeline``.
            mapping_labels: Maps a model label (the ``entity_group`` of a
                prediction) to the Presidio entity name to report.
            aggregation_strategy: Forwarded to the pipeline unchanged.
            supported_language: Forwarded to ``EntityRecognizer``.
            ignore_labels: Model labels forwarded to the pipeline's
                ``ignore_labels`` option. ``None`` is forwarded as-is.
        """
        # The default is an immutable tuple so it cannot be shared and mutated
        # across instances; the pipeline still receives a fresh list.
        if ignore_labels is not None:
            ignore_labels = list(ignore_labels)

        # Init the transformers pipeline for the given model or path.
        self.pipeline = pipeline(
            "token-classification",
            model=model_id_or_path,
            aggregation_strategy=aggregation_strategy,
            ignore_labels=ignore_labels,
        )

        # Map model labels to Presidio labels.
        self.label2presidio = mapping_labels

        # Pass the mapped entity names to the parent class.
        super().__init__(
            supported_entities=list(self.label2presidio.values()),
            supported_language=supported_language,
        )

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        nlp_artifacts: NlpArtifacts = None,
    ) -> List[RecognizerResult]:
        """Extract entities from ``text`` using the Transformers pipeline.

        Args:
            text: The text to run through the pipeline.
            entities: Presidio entity names to keep. ``None`` keeps every
                mapped entity.
            nlp_artifacts: Unused by this recognizer.

        Returns:
            One ``RecognizerResult`` per kept prediction, carrying the
            prediction's ``start``, ``end`` and ``score``.
        """
        results = []
        for prediction in self.pipeline(text):
            label = prediction["entity_group"]
            # Predictions with no Presidio equivalent are dropped.
            if label not in self.label2presidio:
                continue

            converted_entity = self.label2presidio[label]
            # Check for None first: ``x in None`` raises TypeError, which made
            # the default ``entities=None`` call fail on the first prediction.
            if entities is None or converted_entity in entities:
                results.append(
                    RecognizerResult(
                        entity_type=converted_entity,
                        start=prediction["start"],
                        end=prediction["end"],
                        score=prediction["score"],
                    )
                )
        return results