# -*- coding: utf-8 -*-
"""pii_redaction_app.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1rE0OCygUnXrD34qaq12QZKp7_ixkYXN2
"""

import re

import gradio as gr
import torch
from spacy.cli import download
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

# Presidio's default NLP engine relies on this spaCy model
download('en_core_web_lg')

# Initialize the Presidio analyzer and anonymizer engines
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Create the Hugging Face NER model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
# Register the placeholder as a single token so it survives tokenization intact
tokenizer.add_tokens('<person>')
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")
# Resize the embeddings to cover the newly added token; without this, feeding
# '<person>' to the model would raise an index error
model.resize_token_embeddings(len(tokenizer))
# Pipeline used by the alternative mask_names_hf implementation further below
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner')
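# Illustrative pipeline output (a sketch; exact fields and scores vary by
# transformers version and model revision):
#   pipe("john lives in berlin")
#   -> [{'entity': 'B-PER', 'score': ..., 'word': 'john', ...},
#       {'entity': 'B-LOC', 'score': ..., 'word': 'berlin', ...}]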

# Entity types for Presidio to detect; see
# https://microsoft.github.io/presidio/supported_entities/
# PERSON is intentionally excluded here -- names are handled by the HF model below
ENT_TYPES = [
    'CREDIT_CARD',
    'EMAIL_ADDRESS',
    'IP_ADDRESS',
    'PHONE_NUMBER',
]
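# Illustrative analyzer output for these entity types (a sketch; spans and
# scores depend on the Presidio version):
#   analyzer.analyze("Reach me at 212-555-0101", language='en', entities=ENT_TYPES)
#   -> [type: PHONE_NUMBER, start: 12, end: 24, score: 0.75]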

def mask_names_hf(text):
    """Replace person names detected by the HF NER model with <PERSON>."""
    # Tokenize the input
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    tokens = inputs.tokens()

    # Run inference (no gradients needed)
    with torch.no_grad():
        outputs = model(**inputs).logits
    predictions = torch.argmax(outputs, dim=2)

    # Replace person tokens with a single <PERSON> placeholder
    words = []
    for token, prediction in zip(tokens, predictions[0].numpy()):
        label = model.config.id2label[prediction]
        if label not in ('I-PER', 'B-PER'):
            words.append(token)
        elif label == 'B-PER':
            # Collapse consecutive name tokens into one placeholder
            if not words or words[-1] != '<PERSON>':
                words.append('<PERSON>')
        # I-PER tokens are dropped: they continue a name already replaced

    # Re-join, dropping the [CLS] and [SEP] special tokens
    return tokenizer.convert_tokens_to_string(words[1:-1])
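# Illustrative behaviour (a sketch; the exact output depends on the model's
# token-level predictions):
#   mask_names_hf("my name is john smith and i live in new york")
#   -> "my name is <PERSON> and i live in new york"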

# Alternative implementation using the HF pipeline API; kept for reference:
# def mask_names_hf(text):
#     outputs = pipe(text)
#     tokens = []
#     for token in outputs:
#         if 'PER' in token['entity']:
#             if not tokens or tokens[-1] != '<PERSON>':
#                 tokens.append('<PERSON>')
#         else:
#             tokens.append(token['word'])
#     return tokenizer.convert_tokens_to_string(tokens)

def anonymize(text):
    """Two-stage redaction: Presidio for structured PII, then HF NER for names."""
    # Stage 1: find and replace structured PII (Presidio)
    ents = analyzer.analyze(text, language='en', entities=ENT_TYPES)
    results = anonymizer.anonymize(text, analyzer_results=ents)
    t = results.text

    # Stage 2: find and replace person names (HF NER)
    t = mask_names_hf(t)

    # Tokenization may have split placeholders like '<PHONE_NUMBER>' into
    # '< phone _ number >'; restore them to their canonical form
    pats = re.findall('<.+?>', t)
    for p in pats:
        t = t.replace(p, p.upper().replace(' ', ''))

    # Collapse adjacent person placeholders left over from split names
    t = t.replace('<PERSON><PERSON>', '<PERSON>')
    return t
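# Illustrative end-to-end call (a sketch; detected spans vary with Presidio
# and model versions):
#   anonymize("I'm John. Email john@example.com or call 212-555-0101.")
#   -> "I'm <PERSON>. Email <EMAIL_ADDRESS> or call <PHONE_NUMBER>."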


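# Launch the Gradio app (debug=True blocks the cell and prints errors in Colab)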
gr.Interface(anonymize, inputs='text', outputs='text').launch(debug=True)