yuragoithf committed on
Commit
d8c2d8e
1 Parent(s): bc22eb1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -0
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import torch
3
+ import gradio as gr
4
+
5
+ from presidio_anonymizer import AnonymizerEngine
6
+ from presidio_analyzer import AnalyzerEngine
7
+ from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
8
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
9
+
10
+
11
# Initialize the Presidio engines used by anonymize(): the analyzer finds
# entities in the text and the anonymizer rewrites the spans it reports.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Create the Hugging Face NER tokenizer/model pair and a ready-made pipeline.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
tokenizer.add_tokens('<person>')
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")
# add_tokens() can extend the vocabulary past the size of the checkpoint's
# embedding matrix; without resizing, any input containing the added token
# would produce an out-of-range embedding index inside the model.
model.resize_token_embeddings(len(tokenizer))
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner')
20
+
21
# Presidio entity types passed to the analyzer by anonymize().
# Full list: https://microsoft.github.io/presidio/supported_entities/
# 'PERSON' is intentionally not listed; names are masked separately by
# mask_names_hf().
ENT_TYPES = [
    "CREDIT_CARD",
    "EMAIL_ADDRESS",
    "IP_ADDRESS",
    "PHONE_NUMBER",
]
29
+
30
def mask_names_hf(text):
    """Replace person names in *text* with the ``<PERSON>`` placeholder.

    The text is tokenized with the module-level ``tokenizer`` and every
    token is classified by the module-level ``model``.  Tokens labelled
    ``B-PER`` or ``I-PER`` are removed and each run of such tokens is
    represented by a single ``<PERSON>``.

    Args:
        text: The input string to scan for person names.

    Returns:
        The string rebuilt from the remaining tokens via
        ``tokenizer.convert_tokens_to_string``.  The result is therefore in
        the tokenizer's normalised form (presumably lower-cased, since the
        checkpoint is an "uncased" one) rather than the original spelling.

    NOTE: ``truncation=True`` means input longer than the tokenizer's
    maximum length is cut off and the excess is absent from the result.
    """
    # Tokenize inputs
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    tokens = inputs.tokens()

    # Make inferences; no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**inputs).logits
    # Plain Python ints are used as keys into id2label below.
    label_ids = torch.argmax(logits, dim=2)[0].tolist()

    # Replace tokens that are people with <PERSON>.  The first and last
    # positions (the special tokens a BERT tokenizer puts around the sequence)
    # are skipped up front, so a stray PER label on one of them cannot shift
    # which tokens are kept.
    words = []
    for token, label_id in zip(tokens[1:-1], label_ids[1:-1]):
        if model.config.id2label[label_id] not in ('B-PER', 'I-PER'):
            words.append(token)
        elif not words or words[-1] != '<PERSON>':
            # Start of a run of name tokens.  This also covers an I-PER with
            # no preceding B-PER, so a removed name always leaves a
            # placeholder behind instead of silently vanishing.
            words.append('<PERSON>')

    # Convert those tokens to a string
    return tokenizer.convert_tokens_to_string(words)
52
+
53
def anonymize(text, min_len=3):
    """Mask personal information in *text*.

    Two passes are applied: Presidio replaces the entity types listed in
    ``ENT_TYPES``, then ``mask_names_hf`` replaces person names.  The second
    pass re-tokenizes the text, which splits the Presidio placeholders into
    pieces (e.g. ``< email _ address >``), so they are restored to their
    compact upper-case form afterwards.

    Args:
        text: The input string to anonymize.
        min_len: Currently unused; kept so existing callers keep working.

    Returns:
        The anonymized string.
    """
    # Find and replace other stuff (Presidio NER)
    ents = analyzer.analyze(text, language='en', entities=ENT_TYPES)
    results = anonymizer.anonymize(text, analyzer_results=ents)
    t = results.text

    # Find and replace names (HF NER)
    t = mask_names_hf(t)

    # Only spans that normalise to a known placeholder are rewritten, so
    # ordinary text such as "1 < 2 and 3 > 2" is left untouched.
    placeholders = {'<' + ent + '>' for ent in ENT_TYPES}
    placeholders.add('<PERSON>')

    def _restore(match):
        span = match.group(0)
        compact = span.upper().replace(' ', '')
        return compact if compact in placeholders else span

    # [^<>] keeps a match from starting at an unrelated earlier '<'.
    t = re.sub(r'<[^<>]+>', _restore, t)

    # Collapse any run of directly adjacent <PERSON> markers into one.
    t = re.sub(r'(?:<PERSON>){2,}', '<PERSON>', t)
    return t
70
+
71
# Texts shown at the top of the Gradio page.
title = "Personal Info Remover"
description = """Personal Info Remover"""

# Build a text-in / text-out UI around anonymize() and start serving it.
demo = gr.Interface(
    fn=anonymize,
    inputs='text',
    outputs='text',
    title=title,
    description=description,
    examples=[
        "My name is Yuriy, contacts info: 0-800-123-456, test@i.ua, IP address is 1.0.0.1",
    ],
)
demo.launch(debug=True)