langdonholmes committed on
Commit
0c03168
•
1 Parent(s): 22bf201

Use nameparser

Browse files
Pipfile CHANGED
@@ -18,6 +18,7 @@ names-dataset = "*"
18
  fastapi = "*"
19
  httpx = "*"
20
  uvicorn = "*"
 
21
 
22
  [dev-packages]
23
 
 
18
  fastapi = "*"
19
  httpx = "*"
20
  uvicorn = "*"
21
+ nameparser = "*"
22
 
23
  [dev-packages]
24
 
Pipfile.lock CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "_meta": {
3
  "hash": {
4
- "sha256": "9cf8ce38b07b8e9be412869628fa94aa5e8444cfda715ed26c2dc73d547e2d9a"
5
  },
6
  "pipfile-spec": 6,
7
  "requires": {
@@ -503,6 +503,14 @@
503
  "markers": "python_version >= '3.6'",
504
  "version": "==1.0.9"
505
  },
 
 
 
 
 
 
 
 
506
  "names-dataset": {
507
  "hashes": [
508
  "sha256:69eea12c9d97e1ae32b6db955bb9b39f7816eb2727d3c6abc726cb475ad160ac"
 
1
  {
2
  "_meta": {
3
  "hash": {
4
+ "sha256": "256f4e2bde3a1d4e60d68e505515f71f0c4e11292e4c7d9d826e08aac629579b"
5
  },
6
  "pipfile-spec": 6,
7
  "requires": {
 
503
  "markers": "python_version >= '3.6'",
504
  "version": "==1.0.9"
505
  },
506
+ "nameparser": {
507
+ "hashes": [
508
+ "sha256:ea2e01d1d9d04c0648be230f161f27316a1b5be431a1cc64e8799fac548fb3bc",
509
+ "sha256:f4b6c7c1048d528bd6aa2b27cf42a06447d2b31e45a95b20449513078f1d86ef"
510
+ ],
511
+ "index": "pypi",
512
+ "version": "==1.1.2"
513
+ },
514
  "names-dataset": {
515
  "hashes": [
516
  "sha256:69eea12c9d97e1ae32b6db955bb9b39f7816eb2727d3c6abc726cb475ad160ac"
anonymizer.py DELETED
@@ -1,153 +0,0 @@
1
- import logging
2
- from pathlib import Path
3
- from typing import List, Optional, Tuple
4
-
5
- import pandas as pd
6
- from presidio_analyzer import RecognizerResult
7
- from presidio_anonymizer import AnonymizerEngine
8
- from presidio_anonymizer.entities import OperatorConfig
9
- from presidio_anonymizer.operators import OperatorType
10
-
11
- from names_database import NameDatabase
12
-
13
- name_table = Path('data', 'ascii_names.parquet')
14
-
15
- logger = logging.getLogger('anonymizer')
16
-
17
- class surrogate_anonymizer(AnonymizerEngine):
18
- def __init__(self):
19
- super().__init__()
20
- self.names_db = NameDatabase()
21
- self.names_df = pd.read_parquet(name_table)
22
-
23
- # keep track of names we have seen
24
- self.seen_names = dict()
25
-
26
- def get_random_name(
27
- self,
28
- country: Optional[str] = None,
29
- gender: Optional[str] = None
30
- ) -> pd.DataFrame:
31
- '''Returns two random names from the database as a DataFrame.
32
- Both rows match gender and country, if provided.
33
- :country: ISO country code e.g. "CO" for Columbia
34
- :gender: 'M' or 'F'
35
- returns two rows of the names dataframe
36
- '''
37
- names_view = self.names_df
38
- if country:
39
- names_view = names_view[names_view['country'] == country]
40
- if gender:
41
- names_view = names_view[names_view['gender'] == gender]
42
- if names_view.size < 25:
43
- return self.names_df.sample(n=2, weights=self.names_df['count'])
44
- return names_view.sample(n=2, weights=names_view['count'])
45
-
46
- def split_name(self, original_name: str) -> Tuple[str]:
47
- '''Splits name into parts.
48
- If one token, assume it is a first name.
49
- If two tokens, first and last name.
50
- If three tokens, one first name and two last names.
51
- If four tokens, two first names and two last names.'''
52
- names = original_name.split()
53
- if len(names) == 1:
54
- logger.info(f'Splitting to 1 first name: {names}')
55
- return names[0], None
56
- elif len(names) == 2:
57
- logger.info(f'Splitting to 1 first name, 1 last name: {names}')
58
- return names[0], names[1]
59
- elif len(names) == 3:
60
- logger.info(f'Splitting to 1 first name, 2 last names: {names}')
61
- return names[0], ' '.join(names[1:])
62
- elif len(names) == 4:
63
- logger.info(f'Splitting to 2 first names and 2 last names: {names}')
64
- return ' '.join(names[:2]), ' '.join(names[2:])
65
- else:
66
- logger.info(f'Splitting failed, do not match gender/country: {names}')
67
- return None, None
68
-
69
- def generate_surrogate(self, original_name: str) -> str:
70
- '''Generate a surrogate name.
71
- '''
72
- if original_name == 'PII':
73
- # Every time we call this function, Presidio will validate it
74
- # by testing that the function returns a str when the input is
75
- # 'PII'. Bypass this test.
76
- return 'PII'
77
-
78
- # If we have seen this name before, return the same surrogate
79
- if original_name in self.seen_names:
80
- return self.seen_names[original_name]
81
-
82
- first_names, last_names = self.split_name(original_name)
83
- gender = self.names_db.get_gender(first_names) if first_names else None
84
- logger.debug(f'Gender set to {gender}')
85
- country = self.names_db.get_country(last_names) if last_names else None
86
- logger.debug(f'Country set to {country}')
87
-
88
- surrogate_name = ''
89
-
90
- name_candidates = self.get_random_name(gender=gender, country=country)
91
-
92
- surrogate_name += name_candidates.iloc[0]['first']
93
- logger.info(f'First name surrogate is {surrogate_name}')
94
-
95
- if last_names:
96
- logger.info(f'Combining with {name_candidates.iloc[1]["last"]}')
97
- surrogate_name += ' ' + name_candidates.iloc[1]['last']
98
-
99
- logger.info(f'Returning surrogate name {surrogate_name}')
100
-
101
- self.seen_names[original_name] = surrogate_name
102
-
103
- return surrogate_name
104
-
105
- def anonymize(
106
- self,
107
- text: str,
108
- analyzer_results: List[RecognizerResult]
109
- ):
110
- '''Anonymize identified input using Presidio Anonymizer.'''
111
-
112
- if not text:
113
- return
114
-
115
- analyzer_results = self._remove_conflicts_and_get_text_manipulation_data(
116
- analyzer_results
117
- )
118
-
119
- operators = self._AnonymizerEngine__check_or_add_default_operator(
120
- {
121
- 'STUDENT': OperatorConfig('custom',
122
- {'lambda': self.generate_surrogate}),
123
- 'EMAIL_ADDRESS': OperatorConfig('replace',
124
- {'new_value': 'janedoe@aol.com'}),
125
- 'PHONE_NUMBER': OperatorConfig('replace',
126
- {'new_value': '888-888-8888'}),
127
- 'URL': OperatorConfig('replace',
128
- {'new_value': 'aol.com'}),
129
- }
130
- )
131
-
132
- res = self._operate(text,
133
- analyzer_results,
134
- operators,
135
- OperatorType.Anonymize)
136
-
137
- return res.text
138
-
139
- if __name__ == '__main__':
140
- logging.basicConfig(level=logging.DEBUG)
141
-
142
- anonymizer = surrogate_anonymizer()
143
-
144
- test_names = ['Nora Wang',
145
- 'MJ',
146
- '',
147
- '(',
148
- 'Mario Escobar Sanchez',
149
- 'Jane Fonda Michelle Rousseau',
150
- 'Sir Phillipe Ricardo de la Sota Mayor']
151
-
152
- for name in test_names:
153
- anonymizer.generate_surrogate(name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
piilo/__init__.py ADDED
File without changes
analyzer.py → piilo/analyzer.py RENAMED
File without changes
main.py → piilo/main.py RENAMED
File without changes
{models → piilo/models}/anonymize.py RENAMED
File without changes
names_database.py → piilo/names_database.py RENAMED
File without changes
test_main.py → piilo/test_main.py RENAMED
File without changes