Spaces:
Sleeping
Sleeping
from transformers import pipeline | |
from presidio_analyzer import ( | |
RecognizerResult, | |
EntityRecognizer, | |
AnalysisExplanation, | |
) | |
from presidio_analyzer.nlp_engine import NlpEngine, SpacyNlpEngine, NlpArtifacts | |
class TransformerRecognizer(EntityRecognizer):
    """Presidio recognizer backed by a Transformers token-classification pipeline.

    Labels predicted by the model are translated to Presidio entity names
    through ``mapping_labels``. Predictions whose label has no entry in the
    mapping are dropped.
    """

    def __init__(
        self,
        model_id_or_path,
        mapping_labels,
        aggregation_strategy="simple",
        supported_language="fr",
        ignore_labels=("O", "MISC"),
    ):
        """Build the pipeline and register the supported Presidio entities.

        Args:
            model_id_or_path: Model identifier or path handed to
                ``transformers.pipeline`` as ``model``.
            mapping_labels: Dict mapping model labels to Presidio entity names.
            aggregation_strategy: Forwarded to the pipeline unchanged.
            supported_language: Language code passed to the parent recognizer.
            ignore_labels: Model labels the pipeline should discard.
        """
        # Copy into a fresh list so no default object is shared between
        # instances (the previous default was a mutable list literal).
        if ignore_labels is not None:
            ignore_labels = list(ignore_labels)

        # Initialise the transformers pipeline for the given model id or path.
        self.pipeline = pipeline(
            "token-classification",
            model=model_id_or_path,
            aggregation_strategy=aggregation_strategy,
            ignore_labels=ignore_labels,
        )

        # Map model labels to Presidio labels.
        self.label2presidio = mapping_labels

        # Expose the mapped Presidio entities through the parent class.
        super().__init__(
            supported_entities=list(self.label2presidio.values()),
            supported_language=supported_language,
        )

    def load(self) -> None:
        """No loading is required; the pipeline is created in ``__init__``."""

    def analyze(self, text: str, entities=None, nlp_artifacts: NlpArtifacts = None):
        """Extract entities from ``text`` using the Transformers pipeline.

        Args:
            text: The text to analyse.
            entities: Presidio entity names to keep. ``None`` keeps every
                mapped entity.
            nlp_artifacts: Accepted for interface compatibility; not used here.

        Returns:
            A list of ``RecognizerResult``, one per retained prediction.
        """
        results = []
        for prediction in self.pipeline(text):
            label = prediction["entity_group"]
            if label not in self.label2presidio:
                continue

            presidio_entity = self.label2presidio[label]
            # Check for None first: ``x in None`` raises TypeError, which made
            # the default ``entities=None`` call path crash.
            if entities is None or presidio_entity in entities:
                results.append(
                    RecognizerResult(
                        entity_type=presidio_entity,
                        start=prediction["start"],
                        end=prediction["end"],
                        score=prediction["score"],
                    )
                )
        return results