"""Gradio app that removes personal information from free text.

Two passes are applied to the input: Presidio replaces structured
identifiers (credit cards, e-mail addresses, IP addresses, phone numbers)
and a Hugging Face BERT NER model masks person names.
"""
import re

import torch
import gradio as gr
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

MODEL_NAME = "dslim/bert-base-NER-uncased"

# Marker inserted where a person name was removed. It is lower-case here;
# anonymize() upper-cases every <...> tag at the end, so it is emitted as
# "<PERSON>".
PERSON_TOKEN = '<person>'

# NER labels treated as "part of a person name".
PERSON_LABELS = frozenset({'B-PER', 'I-PER'})

# Any <...> span on a single line (non-greedy), e.g. "< email _ address >".
_TAG_RE = re.compile(r'<.+?>')

# Initialize the Presidio engines
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Create the tokenizer, model and pipeline
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
# Register the marker as a single token. The embedding matrix must grow with
# the vocabulary, otherwise an input that already contains the marker would
# index past the end of the embedding table.
if tokenizer.add_tokens(PERSON_TOKEN):
    model.resize_token_embeddings(len(tokenizer))
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner')

# https://microsoft.github.io/presidio/supported_entities/
# 'PERSON' is deliberately not listed: names are masked by mask_names_hf().
ENT_TYPES = [
    'CREDIT_CARD',
    'EMAIL_ADDRESS',
    'IP_ADDRESS',
    'PHONE_NUMBER',
]


def mask_names_hf(text: str) -> str:
    """Replace person names in *text* with ``PERSON_TOKEN``.

    The text is tokenized, every token is labelled by the NER model, and each
    run of consecutive person tokens is collapsed into a single marker. The
    remaining tokens are joined back into a string by the tokenizer, so the
    result reflects the tokenizer's normalisation rather than the exact input
    formatting.

    NOTE: the input is tokenized with ``truncation=True``; tokens beyond the
    tokenizer's maximum length are dropped and do not appear in the output.

    Args:
        text: Text to mask.

    Returns:
        The re-assembled text with person names replaced by ``PERSON_TOKEN``.
    """
    # Tokenize inputs
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    tokens = inputs.tokens()

    # Make inferences; gradients are not needed for prediction.
    with torch.no_grad():
        logits = model(**inputs).logits
    label_ids = torch.argmax(logits, dim=2)[0].tolist()

    # Skip the first and last positions ([CLS] / [SEP]) by position instead of
    # slicing the output afterwards, so a mislabelled special token can never
    # cause real content to be dropped.
    words = []
    for token, label_id in zip(tokens[1:-1], label_ids[1:-1]):
        if model.config.id2label[label_id] not in PERSON_LABELS:
            words.append(token)
        elif not words or words[-1] != PERSON_TOKEN:
            # First token of a name run (including a stray I-PER without a
            # preceding B-PER): emit exactly one marker for the whole run.
            words.append(PERSON_TOKEN)

    # Convert those tokens to a string
    return tokenizer.convert_tokens_to_string(words)


def _normalize_tag(match: "re.Match[str]") -> str:
    """Return the matched ``<...>`` tag upper-cased with spaces removed."""
    return match.group(0).upper().replace(' ', '')


def anonymize(text: str, min_len: int = 3) -> str:
    """Remove personal information from *text*.

    Args:
        text: Raw user text.
        min_len: Unused; kept so existing callers keep working.

    Returns:
        The text with Presidio entities and person names replaced by
        upper-case ``<TAG>`` markers. Empty input yields an empty string.
    """
    if not text:
        return ''

    # Find and replace other stuff (Presidio NER)
    ents = analyzer.analyze(text, language='en', entities=ENT_TYPES)
    masked = anonymizer.anonymize(text, analyzer_results=ents).text

    # Find and replace names (HF NER)
    masked = mask_names_hf(masked)

    # The tokenizer round trip may lower-case tags and split them into spaced
    # pieces; restore every <...> span to a compact upper-case tag in one pass.
    return _TAG_RE.sub(_normalize_tag, masked)


title = "Personal Info Remover"
description = """Personal Info Remover"""

gr.Interface(
    anonymize,
    inputs='text',
    outputs='text',
    title=title,
    description=description,
    examples=["My name is Yuriy, \ncontacts info: 0-800-123-456, test@i.ua, IP address is 1.0.0.1"],
).launch(debug=True)