# -*- coding: utf-8 -*-
"""pii_redaction_app.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1rE0OCygUnXrD34qaq12QZKp7_ixkYXN2

Gradio demo that redacts PII from free text in two passes:

1. Presidio analyzes the text for the entity types in ``ENT_TYPES`` and its
   anonymizer rewrites the matches.
2. A Hugging Face token-classification model (``dslim/bert-base-NER-uncased``)
   masks tokens labelled as person names.
"""

import re

import gradio as gr
import torch
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
from spacy.cli import download
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Fetch the spaCy model before the Presidio analyzer is constructed
# (presumably the model AnalyzerEngine's default NLP engine loads -- confirm).
download('en_core_web_lg')

# Initialize the Presidio engines.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Create the NER tokenizer / model / pipeline.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
# NOTE(review): this registers an empty string as an extra token, which looks
# unintentional (possibly a placeholder tag that was lost). Left untouched:
# registering a real token would presumably also require resizing the model's
# embedding matrix -- confirm the intent before changing it.
tokenizer.add_tokens('')
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")
# `pipe` is not used by the functions below; kept as a module-level name.
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner')

# Entity types Presidio is asked to detect.
# https://microsoft.github.io/presidio/supported_entities/
ENT_TYPES = [
    # 'PERSON',  # disabled here; person names are masked by mask_names_hf()
    'CREDIT_CARD',
    'EMAIL_ADDRESS',
    'IP_ADDRESS',
    'PHONE_NUMBER',
]

# Labels the NER model uses for the first / subsequent tokens of a person name.
_PERSON_START_LABEL = 'B-PER'
_PERSON_LABELS = ('I-PER', 'B-PER')

# Token inserted where a person name starts.
# NOTE(review): this is an empty string in the code as received, so detected
# names are simply dropped from the output. The `<...>` tag normalisation in
# anonymize() suggests a tag such as "<PERSON>" may have been intended --
# confirm with the author before changing the value.
PERSON_PLACEHOLDER = ''

# Anything between "<" and the next ">" (non-greedy); used to tidy up tags
# after they have been through the tokenizer round trip.
_TAG_PATTERN = re.compile(r'<.+?>')


def mask_names_hf(text: str) -> str:
    """Mask person names in *text* using the Hugging Face NER model.

    The text is tokenized, every token is classified, and the string is
    rebuilt from the surviving tokens:

    * tokens not labelled ``B-PER``/``I-PER`` are kept;
    * a ``B-PER`` token is replaced by ``PERSON_PLACEHOLDER`` (consecutive
      placeholders are collapsed into one);
    * an ``I-PER`` token is dropped without a placeholder.

    Because the result is rebuilt from tokens and the tokenizer is called
    with ``truncation=True``, any text beyond the tokenizer's maximum
    sequence length is absent from the returned string. The checkpoint is an
    "uncased" one, so the output is presumably lowercased as well.

    Args:
        text: Raw input text.

    Returns:
        The text with person-name tokens masked.
    """
    # Tokenize inputs
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    tokens = inputs.tokens()

    # Make inferences; no gradients are needed for prediction.
    with torch.no_grad():
        logits = model(**inputs).logits
    label_ids = torch.argmax(logits, dim=2)[0].tolist()

    # The first and last positions are the special tokens the tokenizer adds
    # around the sequence. Drop them *before* filtering so that a mislabelled
    # special token can neither leak into the output nor cause a real token
    # to be sliced off afterwards.
    words = []
    for token, label_id in zip(tokens[1:-1], label_ids[1:-1]):
        label = model.config.id2label[label_id]
        if label not in _PERSON_LABELS:
            words.append(token)
        elif label == _PERSON_START_LABEL:
            # `not words` guards the lookup when the very first token is a
            # name (previously an IndexError was possible here).
            if not words or words[-1] != PERSON_PLACEHOLDER:
                words.append(PERSON_PLACEHOLDER)
        # I-PER tokens fall through and are dropped.

    # Convert the remaining tokens back to a string
    return tokenizer.convert_tokens_to_string(words)


def _normalize_tag(match):
    """Return the matched ``<...>`` span upper-cased with spaces removed."""
    return match.group(0).upper().replace(' ', '')


def anonymize(text: str, min_len: int = 3) -> str:
    """Redact PII from *text*.

    Runs Presidio for the entity types in ``ENT_TYPES``, then masks person
    names with :func:`mask_names_hf`, and finally normalises every ``<...>``
    span in the result to upper case without spaces (the tokenizer round
    trip in ``mask_names_hf`` presumably lowercases and splits Presidio's
    replacement tags -- this restores them).

    Note that the normalisation applies to *any* ``<...>`` span, including
    angle-bracketed text that was already present in the input.

    Args:
        text: Raw input text.
        min_len: Currently unused; kept so existing callers keep working.

    Returns:
        The redacted text. Empty input yields an empty string.
    """
    if not text:
        # Nothing to redact; skip both models.
        return ''

    # Find and replace other stuff (Presidio NER)
    ents = analyzer.analyze(text, language='en', entities=ENT_TYPES)
    presidio_result = anonymizer.anonymize(text, analyzer_results=ents)

    # Find and replace names (HF NER)
    masked = mask_names_hf(presidio_result.text)

    # Tidy up the tags in a single pass.
    return _TAG_PATTERN.sub(_normalize_tag, masked)


gr.Interface(anonymize, inputs='text', outputs='text').launch()