beki committed on
Commit
49bacc7
1 Parent(s): 1f21ea2

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +149 -0
app.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Streamlit app for Presidio."""
2
+
3
+ import json
4
+ from json import JSONEncoder
5
+
6
+ import pandas as pd
7
+ import streamlit as st
8
+ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
9
+ from presidio_anonymizer import AnonymizerEngine
10
+
11
+ from transformers_recognizer import TransformersRecognizer
12
+
13
+
14
+ import spacy
15
# Streamlit reruns this script from the top on each interaction (per its
# documentation), so only fetch the spaCy model when it is not already
# installed as a package instead of downloading it on every run.
if not spacy.util.is_package("en_core_web_lg"):
    spacy.cli.download("en_core_web_lg")
16
+
17
+
18
+ # Helper methods
19
@st.cache(allow_output_mutation=True)
def analyzer_engine():
    """Build the Presidio AnalyzerEngine used by the app.

    The registry receives a ``TransformersRecognizer`` first and then
    Presidio's predefined recognizers. The function is wrapped in
    ``st.cache`` so the engine can be reused instead of rebuilt per call.

    Returns:
        AnalyzerEngine: engine backed by the registry described above.
    """
    phi_recognizer = TransformersRecognizer()

    recognizers = RecognizerRegistry()
    recognizers.add_recognizer(phi_recognizer)
    recognizers.load_predefined_recognizers()

    return AnalyzerEngine(registry=recognizers)
31
+
32
+
33
@st.cache(allow_output_mutation=True)
def anonymizer_engine():
    """Create the Presidio AnonymizerEngine (wrapped in ``st.cache``).

    Returns:
        AnonymizerEngine: a default-constructed anonymizer.
    """
    engine = AnonymizerEngine()
    return engine
37
+
38
+
39
def get_supported_entities():
    """List the entities the analyzer engine reports as supported."""
    engine = analyzer_engine()
    return engine.get_supported_entities()
42
+
43
+
44
def analyze(**kwargs):
    """Analyze input with the analyzer engine, forwarding all keyword arguments.

    The ``entities`` argument is normalized before the call: when it is
    missing, is ``None``, or contains ``"All"``, it is replaced by ``None``.
    (Presidio's ``AnalyzerEngine.analyze`` documents ``None`` as "look for
    all supported entities".) Any other value is forwarded unchanged.

    Args:
        **kwargs: keyword arguments for ``AnalyzerEngine.analyze`` (this app
            passes ``text``, ``entities``, ``language``, ``score_threshold``
            and ``return_decision_process``).

    Returns:
        Whatever ``AnalyzerEngine.analyze`` returns for these arguments.
    """
    requested = kwargs.get("entities")
    # An explicit entities=None used to crash on the `"All" in None`
    # membership test; treat it the same as an absent key.
    if requested is None or "All" in requested:
        kwargs["entities"] = None
    return analyzer_engine().analyze(**kwargs)
49
+
50
+
51
def anonymize(text, analyze_results):
    """Anonymize the identified entities using the Presidio Anonymizer.

    Args:
        text: the original input text.
        analyze_results: analyzer findings for ``text``.

    Returns:
        The ``text`` attribute of the anonymizer engine's result.
    """
    return anonymizer_engine().anonymize(text, analyze_results).text
56
+
57
+
58
st.set_page_config(page_title="Presidio demo (English)", layout="wide")

# Side bar: intro text, entity picker, threshold slider and explanation toggle.
sidebar = st.sidebar

sidebar.markdown(
    """
Anonymize PII entities with [presidio](https://aka.ms/presidio), spaCy and a [PHI detection Roberta model](https://huggingface.co/obi/deid_roberta_i2b2).
"""
)

# Every supported entity is offered as an option and selected by default.
st_entities = sidebar.multiselect(
    label="Which entities to look for?",
    options=get_supported_entities(),
    default=list(get_supported_entities()),
)

# NOTE: the variable name is misspelled, but it is kept as-is because the
# main panel reads this module-level name.
st_threhsold = sidebar.slider(
    label="Acceptance threshold",
    min_value=0.0,
    max_value=1.0,
    value=0.35,
)

st_return_decision_process = sidebar.checkbox("Add analysis explanations in json")

sidebar.info(
    "Presidio is an open source framework for PII detection and anonymization. "
    "For more info visit [aka.ms/presidio](https://aka.ms/presidio)"
)
83
+
84
+
85
# Main panel
# Show a temporary notice while the analyzer engine is obtained, then clear it.
analyzer_load_state = st.info("Starting Presidio analyzer...")
engine = analyzer_engine()
analyzer_load_state.empty()

# Two side-by-side columns: input on the left, anonymized output on the right.
col1, col2 = st.columns(2)

# Before: editable input text, pre-filled with a sample prompt.
col1.subheader("Input string:")
default_text = (
    "Type in some text, "
    "like a phone number (212-141-4544) "
    "or a name (Lebron James)."
)
st_text = col1.text_area(label="Enter text", value=default_text, height=400)

# After: detect entities with the sidebar settings, then anonymize the text.
col2.subheader("Output:")
analysis_options = {
    "text": st_text,
    "entities": st_entities,
    "language": "en",
    "score_threshold": st_threhsold,
    "return_decision_process": st_return_decision_process,
}
st_analyze_results = analyze(**analysis_options)
st_anonymize_results = anonymize(st_text, st_analyze_results)
col2.text_area(label="", value=st_anonymize_results, height=400)
116
+
117
+
118
# Table result: one row per finding, or a short notice when there are none.
st.subheader("Findings")
if not st_analyze_results:
    st.text("No findings")
else:
    records = [finding.to_dict() for finding in st_analyze_results]
    column_titles = {
        "entity_type": "Entity type",
        "start": "Start",
        "end": "End",
        "score": "Confidence",
    }
    # Keep only the displayed fields, in this order, then give the columns
    # human-readable titles.
    df = pd.DataFrame.from_records(records)
    df = df[list(column_titles)].rename(columns=column_titles)

    st.dataframe(df, width=1000)
135
+
136
+
137
+ # json result
138
class ToDictListEncoder(JSONEncoder):
    """JSON encoder that serializes objects through their ``to_dict`` method."""

    def default(self, o):
        """Return a JSON-serializable stand-in for ``o``.

        Args:
            o: an object the standard encoder cannot serialize on its own.

        Returns:
            ``[]`` when ``o`` is falsy, otherwise the result of ``o.to_dict()``.

        Raises:
            TypeError: when ``o`` is truthy and has no callable ``to_dict``;
                raised by the base ``JSONEncoder.default`` so unsupported
                objects fail the way ``json`` callers expect instead of with
                an ``AttributeError``.
        """
        if not o:
            return []
        to_dict = getattr(o, "to_dict", None)
        if callable(to_dict):
            return to_dict()
        # Follow the JSONEncoder contract: defer to the base implementation
        # for anything this encoder does not know how to convert.
        return super().default(o)
146
+
147
+
148
# Only emit the raw JSON when the sidebar checkbox asked for explanations.
if st_return_decision_process:
    findings_json = json.dumps(st_analyze_results, cls=ToDictListEncoder)
    st.json(findings_json)