langdonholmes committed on
Commit
9637704
1 Parent(s): 0e58a18

custom anonymizer as class

Browse files
.gitignore CHANGED
@@ -1,3 +1,2 @@
1
- __pycache__/*
2
  .ipynb_checkpoints
3
  __pycache__
 
 
1
  .ipynb_checkpoints
2
  __pycache__
app.py CHANGED
@@ -9,8 +9,8 @@ import pandas as pd
9
  import streamlit as st
10
  from annotated_text import annotated_text
11
 
12
- from piilo.engines.analyzer import prepare_analyzer
13
- from piilo.engines.anonymizer import surrogate_anonymizer
14
 
15
  os.environ['TOKENIZERS_PARALLELISM'] = 'false'
16
  warnings.filterwarnings('ignore')
@@ -26,14 +26,12 @@ def analyzer_engine():
26
  {'lang_code': 'en', 'model_name': 'en_student_name_detector'}],
27
  }
28
 
29
- analyzer = prepare_analyzer(configuration)
30
-
31
- return analyzer
32
 
33
  @st.cache(allow_output_mutation=True)
34
  def anonymizer_engine():
35
  '''Return generate surrogate anonymizer.'''
36
- return surrogate_anonymizer()
37
 
38
  def annotate(text, st_analyze_results, st_entities):
39
  tokens = []
 
9
  import streamlit as st
10
  from annotated_text import annotated_text
11
 
12
+ from piilo.engines.analyzer import CustomAnalyzer
13
+ from piilo.engines.anonymizer import SurrogateAnonymizer
14
 
15
  os.environ['TOKENIZERS_PARALLELISM'] = 'false'
16
  warnings.filterwarnings('ignore')
 
26
  {'lang_code': 'en', 'model_name': 'en_student_name_detector'}],
27
  }
28
 
29
+ return CustomAnalyzer(configuration=configuration)
 
 
30
 
31
  @st.cache(allow_output_mutation=True)
32
  def anonymizer_engine():
33
  '''Return generate surrogate anonymizer.'''
34
+ return SurrogateAnonymizer()
35
 
36
  def annotate(text, st_analyze_results, st_entities):
37
  tokens = []
piilo/engines/analyzer.py CHANGED
@@ -117,25 +117,27 @@ class CustomSpacyRecognizer(LocalRecognizer):
117
  [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
118
  )
119
 
120
- def prepare_analyzer(configuration):
121
- '''Handle Preparation of Analyzer Engine for Presidio.'''
122
-
123
- spacy_recognizer = CustomSpacyRecognizer()
124
-
125
- # Create NLP engine based on configuration
126
- provider = NlpEngineProvider(nlp_configuration=configuration)
127
- nlp_engine = provider.create_engine()
128
-
129
- # add rule-based recognizers
130
- registry = RecognizerRegistry()
131
- registry.load_predefined_recognizers(nlp_engine=nlp_engine)
132
- registry.add_recognizer(spacy_recognizer)
133
 
134
- # remove the nlp engine we passed, to use custom label mappings
135
- registry.remove_recognizer('SpacyRecognizer')
 
 
136
 
137
- analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
138
- registry=registry,
139
- supported_languages=['en'])
140
 
141
- return analyzer
 
 
 
 
 
117
  [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
118
  )
119
 
120
+ class CustomAnalyzer(AnalyzerEngine):
121
+ '''Custom Analyzer Engine for Presidio.'''
122
+
123
+ def __init__(self, configuration):
124
+
125
+ spacy_recognizer = CustomSpacyRecognizer()
126
+
127
+ # Create NLP engine based on configuration
128
+ provider = NlpEngineProvider(nlp_configuration=configuration)
129
+ nlp_engine = provider.create_engine()
 
 
 
130
 
131
+ # add rule-based recognizers
132
+ registry = RecognizerRegistry()
133
+ registry.load_predefined_recognizers(nlp_engine=nlp_engine)
134
+ registry.add_recognizer(spacy_recognizer)
135
 
136
+ # remove the nlp engine we passed, to use custom label mappings
137
+ registry.remove_recognizer('SpacyRecognizer')
 
138
 
139
+ super().__init__(
140
+ nlp_engine=nlp_engine,
141
+ registry=registry,
142
+ supported_languages=['en']
143
+ )
piilo/engines/anonymizer.py CHANGED
@@ -14,6 +14,7 @@ data = Path(__file__).parent.parent.parent / 'data'
14
  name_table = data / 'ascii_names.parquet'
15
 
16
  logger = logging.getLogger('anonymizer')
 
17
  class NameDatabase(NameDataset):
18
  '''A wrapper around the names_dataset.NameDataset class.
19
  '''
@@ -45,7 +46,7 @@ class NameDatabase(NameDataset):
45
  country = NameWrapper(self.search(last_names)).country
46
  return country if country else None
47
 
48
- class surrogate_anonymizer(AnonymizerEngine):
49
  '''A wrapper around the presidio_anonymizer.AnonymizerEngine class.
50
  '''
51
 
 
14
  name_table = data / 'ascii_names.parquet'
15
 
16
  logger = logging.getLogger('anonymizer')
17
+
18
  class NameDatabase(NameDataset):
19
  '''A wrapper around the names_dataset.NameDataset class.
20
  '''
 
46
  country = NameWrapper(self.search(last_names)).country
47
  return country if country else None
48
 
49
+ class SurrogateAnonymizer(AnonymizerEngine):
50
  '''A wrapper around the presidio_anonymizer.AnonymizerEngine class.
51
  '''
52
 
piilo/main.py CHANGED
@@ -5,8 +5,8 @@ import logging
5
  from fastapi import FastAPI
6
  from fastapi.middleware.cors import CORSMiddleware
7
 
8
- from engines.analyzer import prepare_analyzer
9
- from engines.anonymizer import surrogate_anonymizer
10
  from models.anonymize import AnonymizeRequest, AnonymizeResponse
11
 
12
  configuration = {
@@ -19,8 +19,8 @@ logger = logging.getLogger('api')
19
  logging.basicConfig(level=logging.INFO)
20
 
21
  logger.info("Loading Custom Presidio Analyzer and Anonymizer...")
22
- analyzer = prepare_analyzer(configuration)
23
- anonymizer = surrogate_anonymizer()
24
  logger.info("Loading Successful!")
25
 
26
  app = FastAPI()
 
5
  from fastapi import FastAPI
6
  from fastapi.middleware.cors import CORSMiddleware
7
 
8
+ from engines.analyzer import CustomAnalyzer
9
+ from engines.anonymizer import SurrogateAnonymizer
10
  from models.anonymize import AnonymizeRequest, AnonymizeResponse
11
 
12
  configuration = {
 
19
  logging.basicConfig(level=logging.INFO)
20
 
21
  logger.info("Loading Custom Presidio Analyzer and Anonymizer...")
22
+ analyzer = CustomAnalyzer(configuration)
23
+ anonymizer = SurrogateAnonymizer()
24
  logger.info("Loading Successful!")
25
 
26
  app = FastAPI()