ziggycross commited on
Commit
003953a
1 Parent(s): a51662f

Improved k-anonymizer.

Browse files
Files changed (3) hide show
  1. app.py +10 -10
  2. loader-cleaner.ipynb +707 -3
  3. modules.py +99 -49
app.py CHANGED
@@ -4,6 +4,7 @@ from streamlit_extras.let_it_rain import rain
4
 
5
  # Options
6
  DISCLAIMER = "*Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam urna sem, bibendum efficitur pellentesque a, sollicitudin pharetra urna. Nam vel lectus vitae elit luctus feugiat a a purus. Aenean mollis quis ipsum sed ornare. Nunc sit amet ultricies tellus. Vivamus vulputate sem id molestie viverra. Etiam egestas lobortis enim, sit amet lobortis ligula sollicitudin vel. Nunc eget ipsum sollicitudin, convallis.*"
 
7
 
8
  # Page Config
9
  st.set_page_config(layout="wide")
@@ -22,19 +23,18 @@ if df is None: # Await file to be uploaded
22
  else:
23
  ### PRE-TRANSFORM features for sidebar
24
  with st.sidebar:
25
- # Options for data cleaning
26
- with st.container() as cleaning_options:
27
- st.markdown("### Data cleaning options:")
28
  remove_duplicates = st.checkbox("Remove duplicate rows", value=True)
29
  drop_missing = st.checkbox("Remove rows with missing values", value=False)
30
 
31
  # Options for data optimization
32
  with st.container() as anonymizing_options:
33
  st.markdown("### Anonymizing options:")
34
- sample_checkbox = st.checkbox("Test checkbox", value=True)
35
- sample_slider = st.slider("Test slider", min_value=1, max_value=10, value=2)
36
- sample_number = st.number_input("Test number", min_value=0, max_value=100, value=50)
37
- sample_dropdown = st.selectbox("Test dropdown", ["A", "B", "C"], index=1)
38
 
39
 
40
  ### DATA PREVIEW AND TRANSFORM
@@ -46,8 +46,7 @@ else:
46
 
47
  # Transform data
48
  df = modules.data_cleaner(df, drop_missing, remove_duplicates)
49
- df = modules.data_anonymizer(df)
50
- # download_file = modules.create_file(df, ".csv")
51
 
52
  # Preview data after before_data
53
  with st.container() as after_data:
@@ -60,8 +59,9 @@ else:
60
  with st.sidebar:
61
  # Options for download
62
  with st.container() as download_header:
63
- st.markdown("### Download")
64
  output_extension = st.selectbox("File type", [".csv", ".json", ".xlsx"])
 
65
 
66
  # Prepare file for download
67
  with st.container() as downloader:
 
4
 
5
  # Options
6
  DISCLAIMER = "*Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam urna sem, bibendum efficitur pellentesque a, sollicitudin pharetra urna. Nam vel lectus vitae elit luctus feugiat a a purus. Aenean mollis quis ipsum sed ornare. Nunc sit amet ultricies tellus. Vivamus vulputate sem id molestie viverra. Etiam egestas lobortis enim, sit amet lobortis ligula sollicitudin vel. Nunc eget ipsum sollicitudin, convallis.*"
7
+ K = 2
8
 
9
  # Page Config
10
  st.set_page_config(layout="wide")
 
23
  else:
24
  ### PRE-TRANSFORM features for sidebar
25
  with st.sidebar:
26
+ # Options for data loading
27
+ with st.container() as loading_options:
28
+ st.markdown("### Data loading options:")
29
  remove_duplicates = st.checkbox("Remove duplicate rows", value=True)
30
  drop_missing = st.checkbox("Remove rows with missing values", value=False)
31
 
32
  # Options for data optimization
33
  with st.container() as anonymizing_options:
34
  st.markdown("### Anonymizing options:")
35
+ max_categorical_size = st.slider("Maximum number of categories", min_value=2, max_value=200, value=50)
36
+ bin_size = st.slider("Target bin size", min_value=2, max_value=200, value=20)
37
+ sensitivity_minimum = st.number_input("Minimum count", min_value=2, max_value=10, value=2)
 
38
 
39
 
40
  ### DATA PREVIEW AND TRANSFORM
 
46
 
47
  # Transform data
48
  df = modules.data_cleaner(df, drop_missing, remove_duplicates)
49
+ df, unprocessed = modules.data_anonymizer(df, K, max_categorical_size, bin_size, sensitivity_minimum)
 
50
 
51
  # Preview data after before_data
52
  with st.container() as after_data:
 
59
  with st.sidebar:
60
  # Options for download
61
  with st.container() as download_header:
62
+ st.markdown("### Download options:")
63
  output_extension = st.selectbox("File type", [".csv", ".json", ".xlsx"])
64
+ if unprocessed: st.markdown(f"Error encountered when processing columns {str(unprocessed)}")
65
 
66
  # Prepare file for download
67
  with st.container() as downloader:
loader-cleaner.ipynb CHANGED
@@ -6,8 +6,10 @@
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
- "import pandas as pd\n",
10
- "import os"
 
 
11
  ]
12
  },
13
  {
@@ -79,12 +81,714 @@
79
  "metadata": {},
80
  "outputs": [],
81
  "source": [
82
- "DROP_MISSING = True\n",
83
  "REMOVE_DUPLICATES = True\n",
84
  "\n",
85
  "df = df.dropna(how=\"any\" if DROP_MISSING else \"all\")\n",
86
  "if REMOVE_DUPLICATES: df = df.drop_duplicates()"
87
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  }
89
  ],
90
  "metadata": {
 
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
+ "from itertools import combinations\n",
10
+ "import numpy as np\n",
11
+ "import os\n",
12
+ "import pandas as pd"
13
  ]
14
  },
15
  {
 
81
  "metadata": {},
82
  "outputs": [],
83
  "source": [
84
+ "DROP_MISSING = False\n",
85
  "REMOVE_DUPLICATES = True\n",
86
  "\n",
87
  "df = df.dropna(how=\"any\" if DROP_MISSING else \"all\")\n",
88
  "if REMOVE_DUPLICATES: df = df.drop_duplicates()"
89
  ]
90
+ },
91
+ {
92
+ "attachments": {},
93
+ "cell_type": "markdown",
94
+ "metadata": {},
95
+ "source": [
96
+ "### Anonymize data"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 4,
102
+ "metadata": {},
103
+ "outputs": [
104
+ {
105
+ "name": "stderr",
106
+ "output_type": "stream",
107
+ "text": [
108
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
109
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
110
+ "\n",
111
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
112
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
113
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
114
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
115
+ "\n",
116
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
117
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
118
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
119
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
120
+ "\n",
121
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
122
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
123
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
124
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
125
+ "\n",
126
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
127
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
128
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
129
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
130
+ "\n",
131
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
132
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
133
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
134
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
135
+ "\n",
136
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
137
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
138
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
139
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
140
+ "\n",
141
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
142
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
143
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
144
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
145
+ "\n",
146
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
147
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
148
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
149
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
150
+ "\n",
151
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
152
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
153
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
154
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
155
+ "\n",
156
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
157
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
158
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
159
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
160
+ "\n",
161
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
162
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
163
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
164
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
165
+ "\n",
166
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
167
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
168
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
169
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
170
+ "\n",
171
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
172
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
173
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
174
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
175
+ "\n",
176
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
177
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
178
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
179
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
180
+ "\n",
181
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
182
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
183
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
184
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
185
+ "\n",
186
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
187
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
188
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
189
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
190
+ "\n",
191
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
192
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
193
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
194
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
195
+ "\n",
196
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
197
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
198
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
199
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
200
+ "\n",
201
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
202
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
203
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
204
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
205
+ "\n",
206
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
207
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
208
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
209
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
210
+ "\n",
211
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
212
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
213
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
214
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
215
+ "\n",
216
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
217
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
218
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
219
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
220
+ "\n",
221
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
222
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
223
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
224
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
225
+ "\n",
226
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
227
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
228
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
229
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
230
+ "\n",
231
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
232
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
233
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
234
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
235
+ "\n",
236
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
237
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
238
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
239
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
240
+ "\n",
241
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
242
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
243
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
244
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
245
+ "\n",
246
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
247
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
248
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
249
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
250
+ "\n",
251
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
252
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
253
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
254
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
255
+ "\n",
256
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
257
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
258
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
259
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
260
+ "\n",
261
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
262
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
263
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
264
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
265
+ "\n",
266
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
267
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
268
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
269
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
270
+ "\n",
271
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
272
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
273
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
274
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
275
+ "\n",
276
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
277
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
278
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
279
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
280
+ "\n",
281
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
282
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
283
+ "/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
284
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
285
+ "\n",
286
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
287
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n"
288
+ ]
289
+ },
290
+ {
291
+ "data": {
292
+ "text/html": [
293
+ "<div>\n",
294
+ "<style scoped>\n",
295
+ " .dataframe tbody tr th:only-of-type {\n",
296
+ " vertical-align: middle;\n",
297
+ " }\n",
298
+ "\n",
299
+ " .dataframe tbody tr th {\n",
300
+ " vertical-align: top;\n",
301
+ " }\n",
302
+ "\n",
303
+ " .dataframe thead th {\n",
304
+ " text-align: right;\n",
305
+ " }\n",
306
+ "</style>\n",
307
+ "<table border=\"1\" class=\"dataframe\">\n",
308
+ " <thead>\n",
309
+ " <tr style=\"text-align: right;\">\n",
310
+ " <th></th>\n",
311
+ " <th>Employee_Name</th>\n",
312
+ " <th>EmpID</th>\n",
313
+ " <th>MarriedID</th>\n",
314
+ " <th>MaritalStatusID</th>\n",
315
+ " <th>GenderID</th>\n",
316
+ " <th>EmpStatusID</th>\n",
317
+ " <th>DeptID</th>\n",
318
+ " <th>PerfScoreID</th>\n",
319
+ " <th>FromDiversityJobFairID</th>\n",
320
+ " <th>Salary</th>\n",
321
+ " <th>...</th>\n",
322
+ " <th>ManagerName</th>\n",
323
+ " <th>ManagerID</th>\n",
324
+ " <th>RecruitmentSource</th>\n",
325
+ " <th>PerformanceScore</th>\n",
326
+ " <th>EngagementSurvey</th>\n",
327
+ " <th>EmpSatisfaction</th>\n",
328
+ " <th>SpecialProjectsCount</th>\n",
329
+ " <th>LastPerformanceReview_Date</th>\n",
330
+ " <th>DaysLateLast30</th>\n",
331
+ " <th>Absences</th>\n",
332
+ " </tr>\n",
333
+ " </thead>\n",
334
+ " <tbody>\n",
335
+ " <tr>\n",
336
+ " <th>0</th>\n",
337
+ " <td>None</td>\n",
338
+ " <td>(10022, 10042)</td>\n",
339
+ " <td>0</td>\n",
340
+ " <td>0</td>\n",
341
+ " <td>1</td>\n",
342
+ " <td>1</td>\n",
343
+ " <td>5</td>\n",
344
+ " <td>4</td>\n",
345
+ " <td>0</td>\n",
346
+ " <td>(62065, 63381)</td>\n",
347
+ " <td>...</td>\n",
348
+ " <td>Michael Albert</td>\n",
349
+ " <td>22.0</td>\n",
350
+ " <td>LinkedIn</td>\n",
351
+ " <td>Exceeds</td>\n",
352
+ " <td>(4.52, 4.68)</td>\n",
353
+ " <td>5</td>\n",
354
+ " <td>0</td>\n",
355
+ " <td>1/17/2019</td>\n",
356
+ " <td>0</td>\n",
357
+ " <td>1</td>\n",
358
+ " </tr>\n",
359
+ " <tr>\n",
360
+ " <th>1</th>\n",
361
+ " <td>None</td>\n",
362
+ " <td>(10064, 10084)</td>\n",
363
+ " <td>1</td>\n",
364
+ " <td>1</td>\n",
365
+ " <td>1</td>\n",
366
+ " <td>5</td>\n",
367
+ " <td>3</td>\n",
368
+ " <td>3</td>\n",
369
+ " <td>0</td>\n",
370
+ " <td>(92328, 104437)</td>\n",
371
+ " <td>...</td>\n",
372
+ " <td>Simon Roup</td>\n",
373
+ " <td>4.0</td>\n",
374
+ " <td>Indeed</td>\n",
375
+ " <td>Fully Meets</td>\n",
376
+ " <td>(4.9, 5.0)</td>\n",
377
+ " <td>3</td>\n",
378
+ " <td>6</td>\n",
379
+ " <td>None</td>\n",
380
+ " <td>0</td>\n",
381
+ " <td>17</td>\n",
382
+ " </tr>\n",
383
+ " <tr>\n",
384
+ " <th>2</th>\n",
385
+ " <td>None</td>\n",
386
+ " <td>(10190, 10210)</td>\n",
387
+ " <td>1</td>\n",
388
+ " <td>1</td>\n",
389
+ " <td>0</td>\n",
390
+ " <td>5</td>\n",
391
+ " <td>5</td>\n",
392
+ " <td>3</td>\n",
393
+ " <td>0</td>\n",
394
+ " <td>(64816, 66825)</td>\n",
395
+ " <td>...</td>\n",
396
+ " <td>Kissy Sullivan</td>\n",
397
+ " <td>20.0</td>\n",
398
+ " <td>LinkedIn</td>\n",
399
+ " <td>Fully Meets</td>\n",
400
+ " <td>(2.9, 3.18)</td>\n",
401
+ " <td>3</td>\n",
402
+ " <td>0</td>\n",
403
+ " <td>None</td>\n",
404
+ " <td>0</td>\n",
405
+ " <td>3</td>\n",
406
+ " </tr>\n",
407
+ " <tr>\n",
408
+ " <th>3</th>\n",
409
+ " <td>None</td>\n",
410
+ " <td>(10085, 10105)</td>\n",
411
+ " <td>1</td>\n",
412
+ " <td>1</td>\n",
413
+ " <td>0</td>\n",
414
+ " <td>1</td>\n",
415
+ " <td>5</td>\n",
416
+ " <td>3</td>\n",
417
+ " <td>0</td>\n",
418
+ " <td>(64816, 66825)</td>\n",
419
+ " <td>...</td>\n",
420
+ " <td>Elijiah Gray</td>\n",
421
+ " <td>16.0</td>\n",
422
+ " <td>Indeed</td>\n",
423
+ " <td>Fully Meets</td>\n",
424
+ " <td>(4.7, 4.88)</td>\n",
425
+ " <td>5</td>\n",
426
+ " <td>0</td>\n",
427
+ " <td>1/3/2019</td>\n",
428
+ " <td>0</td>\n",
429
+ " <td>15</td>\n",
430
+ " </tr>\n",
431
+ " <tr>\n",
432
+ " <th>4</th>\n",
433
+ " <td>None</td>\n",
434
+ " <td>(10064, 10084)</td>\n",
435
+ " <td>0</td>\n",
436
+ " <td>2</td>\n",
437
+ " <td>0</td>\n",
438
+ " <td>5</td>\n",
439
+ " <td>5</td>\n",
440
+ " <td>3</td>\n",
441
+ " <td>0</td>\n",
442
+ " <td>(47837, 51259)</td>\n",
443
+ " <td>...</td>\n",
444
+ " <td>Webster Butler</td>\n",
445
+ " <td>39.0</td>\n",
446
+ " <td>Google Search</td>\n",
447
+ " <td>Fully Meets</td>\n",
448
+ " <td>(5.0, 5.0)</td>\n",
449
+ " <td>4</td>\n",
450
+ " <td>0</td>\n",
451
+ " <td>2/1/2016</td>\n",
452
+ " <td>0</td>\n",
453
+ " <td>2</td>\n",
454
+ " </tr>\n",
455
+ " <tr>\n",
456
+ " <th>...</th>\n",
457
+ " <td>...</td>\n",
458
+ " <td>...</td>\n",
459
+ " <td>...</td>\n",
460
+ " <td>...</td>\n",
461
+ " <td>...</td>\n",
462
+ " <td>...</td>\n",
463
+ " <td>...</td>\n",
464
+ " <td>...</td>\n",
465
+ " <td>...</td>\n",
466
+ " <td>...</td>\n",
467
+ " <td>...</td>\n",
468
+ " <td>...</td>\n",
469
+ " <td>...</td>\n",
470
+ " <td>...</td>\n",
471
+ " <td>...</td>\n",
472
+ " <td>...</td>\n",
473
+ " <td>...</td>\n",
474
+ " <td>...</td>\n",
475
+ " <td>...</td>\n",
476
+ " <td>...</td>\n",
477
+ " <td>...</td>\n",
478
+ " </tr>\n",
479
+ " <tr>\n",
480
+ " <th>306</th>\n",
481
+ " <td>None</td>\n",
482
+ " <td>(10127, 10147)</td>\n",
483
+ " <td>0</td>\n",
484
+ " <td>0</td>\n",
485
+ " <td>1</td>\n",
486
+ " <td>1</td>\n",
487
+ " <td>5</td>\n",
488
+ " <td>3</td>\n",
489
+ " <td>0</td>\n",
490
+ " <td>(64816, 66825)</td>\n",
491
+ " <td>...</td>\n",
492
+ " <td>Kissy Sullivan</td>\n",
493
+ " <td>20.0</td>\n",
494
+ " <td>LinkedIn</td>\n",
495
+ " <td>Fully Meets</td>\n",
496
+ " <td>(3.99, 4.1)</td>\n",
497
+ " <td>4</td>\n",
498
+ " <td>0</td>\n",
499
+ " <td>2/28/2019</td>\n",
500
+ " <td>0</td>\n",
501
+ " <td>13</td>\n",
502
+ " </tr>\n",
503
+ " <tr>\n",
504
+ " <th>307</th>\n",
505
+ " <td>None</td>\n",
506
+ " <td>None</td>\n",
507
+ " <td>0</td>\n",
508
+ " <td>0</td>\n",
509
+ " <td>0</td>\n",
510
+ " <td>5</td>\n",
511
+ " <td>5</td>\n",
512
+ " <td>1</td>\n",
513
+ " <td>0</td>\n",
514
+ " <td>(47837, 51259)</td>\n",
515
+ " <td>...</td>\n",
516
+ " <td>Brannon Miller</td>\n",
517
+ " <td>12.0</td>\n",
518
+ " <td>Google Search</td>\n",
519
+ " <td>PIP</td>\n",
520
+ " <td>(3.19, 3.5)</td>\n",
521
+ " <td>2</td>\n",
522
+ " <td>0</td>\n",
523
+ " <td>None</td>\n",
524
+ " <td>5</td>\n",
525
+ " <td>4</td>\n",
526
+ " </tr>\n",
527
+ " <tr>\n",
528
+ " <th>308</th>\n",
529
+ " <td>None</td>\n",
530
+ " <td>(10001, 10021)</td>\n",
531
+ " <td>0</td>\n",
532
+ " <td>0</td>\n",
533
+ " <td>0</td>\n",
534
+ " <td>1</td>\n",
535
+ " <td>3</td>\n",
536
+ " <td>4</td>\n",
537
+ " <td>0</td>\n",
538
+ " <td>None</td>\n",
539
+ " <td>...</td>\n",
540
+ " <td>Janet King</td>\n",
541
+ " <td>2.0</td>\n",
542
+ " <td>Employee Referral</td>\n",
543
+ " <td>Exceeds</td>\n",
544
+ " <td>(4.52, 4.68)</td>\n",
545
+ " <td>5</td>\n",
546
+ " <td>6</td>\n",
547
+ " <td>2/21/2019</td>\n",
548
+ " <td>0</td>\n",
549
+ " <td>16</td>\n",
550
+ " </tr>\n",
551
+ " <tr>\n",
552
+ " <th>309</th>\n",
553
+ " <td>None</td>\n",
554
+ " <td>(10043, 10063)</td>\n",
555
+ " <td>0</td>\n",
556
+ " <td>0</td>\n",
557
+ " <td>0</td>\n",
558
+ " <td>1</td>\n",
559
+ " <td>3</td>\n",
560
+ " <td>3</td>\n",
561
+ " <td>0</td>\n",
562
+ " <td>(77692, 90100)</td>\n",
563
+ " <td>...</td>\n",
564
+ " <td>Simon Roup</td>\n",
565
+ " <td>4.0</td>\n",
566
+ " <td>Employee Referral</td>\n",
567
+ " <td>Fully Meets</td>\n",
568
+ " <td>(5.0, 5.0)</td>\n",
569
+ " <td>3</td>\n",
570
+ " <td>5</td>\n",
571
+ " <td>2/1/2019</td>\n",
572
+ " <td>0</td>\n",
573
+ " <td>11</td>\n",
574
+ " </tr>\n",
575
+ " <tr>\n",
576
+ " <th>310</th>\n",
577
+ " <td>None</td>\n",
578
+ " <td>(10252, 10271)</td>\n",
579
+ " <td>0</td>\n",
580
+ " <td>4</td>\n",
581
+ " <td>0</td>\n",
582
+ " <td>1</td>\n",
583
+ " <td>5</td>\n",
584
+ " <td>3</td>\n",
585
+ " <td>0</td>\n",
586
+ " <td>(45046, 47750)</td>\n",
587
+ " <td>...</td>\n",
588
+ " <td>David Stanley</td>\n",
589
+ " <td>14.0</td>\n",
590
+ " <td>LinkedIn</td>\n",
591
+ " <td>Fully Meets</td>\n",
592
+ " <td>(4.5, 4.52)</td>\n",
593
+ " <td>5</td>\n",
594
+ " <td>0</td>\n",
595
+ " <td>1/30/2019</td>\n",
596
+ " <td>0</td>\n",
597
+ " <td>2</td>\n",
598
+ " </tr>\n",
599
+ " </tbody>\n",
600
+ "</table>\n",
601
+ "<p>311 rows × 36 columns</p>\n",
602
+ "</div>"
603
+ ],
604
+ "text/plain": [
605
+ " Employee_Name EmpID MarriedID MaritalStatusID GenderID \\\n",
606
+ "0 None (10022, 10042) 0 0 1 \n",
607
+ "1 None (10064, 10084) 1 1 1 \n",
608
+ "2 None (10190, 10210) 1 1 0 \n",
609
+ "3 None (10085, 10105) 1 1 0 \n",
610
+ "4 None (10064, 10084) 0 2 0 \n",
611
+ ".. ... ... ... ... ... \n",
612
+ "306 None (10127, 10147) 0 0 1 \n",
613
+ "307 None None 0 0 0 \n",
614
+ "308 None (10001, 10021) 0 0 0 \n",
615
+ "309 None (10043, 10063) 0 0 0 \n",
616
+ "310 None (10252, 10271) 0 4 0 \n",
617
+ "\n",
618
+ " EmpStatusID DeptID PerfScoreID FromDiversityJobFairID \\\n",
619
+ "0 1 5 4 0 \n",
620
+ "1 5 3 3 0 \n",
621
+ "2 5 5 3 0 \n",
622
+ "3 1 5 3 0 \n",
623
+ "4 5 5 3 0 \n",
624
+ ".. ... ... ... ... \n",
625
+ "306 1 5 3 0 \n",
626
+ "307 5 5 1 0 \n",
627
+ "308 1 3 4 0 \n",
628
+ "309 1 3 3 0 \n",
629
+ "310 1 5 3 0 \n",
630
+ "\n",
631
+ " Salary ... ManagerName ManagerID RecruitmentSource \\\n",
632
+ "0 (62065, 63381) ... Michael Albert 22.0 LinkedIn \n",
633
+ "1 (92328, 104437) ... Simon Roup 4.0 Indeed \n",
634
+ "2 (64816, 66825) ... Kissy Sullivan 20.0 LinkedIn \n",
635
+ "3 (64816, 66825) ... Elijiah Gray 16.0 Indeed \n",
636
+ "4 (47837, 51259) ... Webster Butler 39.0 Google Search \n",
637
+ ".. ... ... ... ... ... \n",
638
+ "306 (64816, 66825) ... Kissy Sullivan 20.0 LinkedIn \n",
639
+ "307 (47837, 51259) ... Brannon Miller 12.0 Google Search \n",
640
+ "308 None ... Janet King 2.0 Employee Referral \n",
641
+ "309 (77692, 90100) ... Simon Roup 4.0 Employee Referral \n",
642
+ "310 (45046, 47750) ... David Stanley 14.0 LinkedIn \n",
643
+ "\n",
644
+ " PerformanceScore EngagementSurvey EmpSatisfaction SpecialProjectsCount \\\n",
645
+ "0 Exceeds (4.52, 4.68) 5 0 \n",
646
+ "1 Fully Meets (4.9, 5.0) 3 6 \n",
647
+ "2 Fully Meets (2.9, 3.18) 3 0 \n",
648
+ "3 Fully Meets (4.7, 4.88) 5 0 \n",
649
+ "4 Fully Meets (5.0, 5.0) 4 0 \n",
650
+ ".. ... ... ... ... \n",
651
+ "306 Fully Meets (3.99, 4.1) 4 0 \n",
652
+ "307 PIP (3.19, 3.5) 2 0 \n",
653
+ "308 Exceeds (4.52, 4.68) 5 6 \n",
654
+ "309 Fully Meets (5.0, 5.0) 3 5 \n",
655
+ "310 Fully Meets (4.5, 4.52) 5 0 \n",
656
+ "\n",
657
+ " LastPerformanceReview_Date DaysLateLast30 Absences \n",
658
+ "0 1/17/2019 0 1 \n",
659
+ "1 None 0 17 \n",
660
+ "2 None 0 3 \n",
661
+ "3 1/3/2019 0 15 \n",
662
+ "4 2/1/2016 0 2 \n",
663
+ ".. ... ... ... \n",
664
+ "306 2/28/2019 0 13 \n",
665
+ "307 None 5 4 \n",
666
+ "308 2/21/2019 0 16 \n",
667
+ "309 2/1/2019 0 11 \n",
668
+ "310 1/30/2019 0 2 \n",
669
+ "\n",
670
+ "[311 rows x 36 columns]"
671
+ ]
672
+ },
673
+ "execution_count": 4,
674
+ "metadata": {},
675
+ "output_type": "execute_result"
676
+ }
677
+ ],
678
+ "source": [
679
+ "K = 2\n",
680
+ "MAX_CATEGORICAL_SIZE = 50\n",
681
+ "BIN_SIZE = 20\n",
682
+ "SENSITIVITY_MINIMUM = 2\n",
683
+ "\n",
684
+ "def column_combinations(df, k):\n",
685
+ " return list(combinations(df.columns, k))\n",
686
+ "\n",
687
+ "def k_redact(df, k):\n",
688
+ " kwise_combinations = column_combinations(df, k) \n",
689
+ " \n",
690
+ " for columns in kwise_combinations:\n",
691
+ " df_search = df.loc[:, columns]\n",
692
+ " sensitive_data = [\n",
693
+ " (columns, key)\n",
694
+ " for key, value\n",
695
+ " in df_search.value_counts().to_dict().items()\n",
696
+ " if value == 1\n",
697
+ " ]\n",
698
+ " if not sensitive_data: continue\n",
699
+ " for columns, values in sensitive_data:\n",
700
+ " for column, value in zip(columns, values):\n",
701
+ " df_search = df_search.loc[df[column] == value]\n",
702
+ " if df_search.shape[0] == 1:\n",
703
+ " for column in columns:\n",
704
+ " df_search[column] = None\n",
705
+ " \n",
706
+ " return df\n",
707
+ "\n",
708
+ "def sensitive_values(series, sensitivity_minimum):\n",
709
+ " return {key\n",
710
+ " for key, value\n",
711
+ " in series.value_counts().to_dict().items()\n",
712
+ " if value < sensitivity_minimum\n",
713
+ " }\n",
714
+ "\n",
715
+ "def drop_sensitive(series, sensitivity_minimum):\n",
716
+ " series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
717
+ "\n",
718
+ "def bin_numeric(df, to_process, bin_size, sensitivity_minimum):\n",
719
+ " processed = set()\n",
720
+ " rows, _ = df.shape\n",
721
+ " num_bins = rows//bin_size\n",
722
+ " for column_name in to_process:\n",
723
+ " column = df[column_name]\n",
724
+ " if column.dtype.kind not in \"biufc\": continue\n",
725
+ " array = sorted(np.array(column))\n",
726
+ " array_min, array_max = array[0], array[-1]\n",
727
+ " splits = [array_min] + list(np.array_split(array, num_bins)) + [array_max]\n",
728
+ " bins = [\n",
729
+ " (np.min(split), np.max(split))\n",
730
+ " for split\n",
731
+ " in (splits[i] for i in range(num_bins))\n",
732
+ " ]\n",
733
+ " result = [None] * rows\n",
734
+ " for bin_min, bin_max in bins:\n",
735
+ " for i, value in enumerate(column):\n",
736
+ " if bin_min <= value <= bin_max:\n",
737
+ " result[i] = (bin_min, bin_max)\n",
738
+ " df[column_name] = result\n",
739
+ " drop_sensitive(df[column_name], sensitivity_minimum)\n",
740
+ " processed.add(column_name)\n",
741
+ " return df, to_process - processed\n",
742
+ "\n",
743
+ "def find_categorical(df, to_process, max_categorical_size, sensitivity_minimum):\n",
744
+ " processed = set()\n",
745
+ " for column_name in to_process:\n",
746
+ " column = df[column_name]\n",
747
+ " if column.nunique() <= max_categorical_size:\n",
748
+ " drop_sensitive(column, sensitivity_minimum)\n",
749
+ " processed.add(column_name)\n",
750
+ " return df, to_process - processed\n",
751
+ "\n",
752
+ "def redact(df, to_process, sensitivity_minimum):\n",
753
+ " processed = set()\n",
754
+ " for column_name in to_process:\n",
755
+ " column = df[column_name]\n",
756
+ " \n",
757
+ " is_object = column.dtype == object\n",
758
+ " if not is_object: continue\n",
759
+ "\n",
760
+ " # Check if any unique values exist, and redact them\n",
761
+ " drop_sensitive(column, sensitivity_minimum)\n",
762
+ " processed.add(column_name)\n",
763
+ "\n",
764
+ " return df, to_process - processed\n",
765
+ "\n",
766
+ "def anonymize(df, max_categorical_size, bin_size, sensitivity_minimum):\n",
767
+ " to_process = set(df.columns)\n",
768
+ " df, to_process = redact(df, to_process, sensitivity_minimum)\n",
769
+ " df, to_process = find_categorical(df, to_process, max_categorical_size, sensitivity_minimum)\n",
770
+ " df, to_process = bin_numeric(df, to_process, bin_size, sensitivity_minimum)\n",
771
+ " return df, to_process\n",
772
+ "\n",
773
+ "def data_anonymizer(df, k, max_categorical_size, bin_size, sensitivity_minimum):\n",
774
+ " start_dtypes = df.dtypes.to_dict()\n",
775
+ " df, unprocessed = anonymize(df, max_categorical_size, bin_size, sensitivity_minimum)\n",
776
+ " df = k_redact(df, k)\n",
777
+ " end_dtypes = df.dtypes.to_dict()\n",
778
+ "\n",
779
+ " # Type correction\n",
780
+ " for column in df.columns:\n",
781
+ " start_type, end_type = start_dtypes[column], end_dtypes[column]\n",
782
+ " if start_type == end_type: continue\n",
783
+ " if start_type.kind == \"i\" and end_type.kind == \"f\":\n",
784
+ " df[column] = df[column].astype(\"Int64\")\n",
785
+ "\n",
786
+ " return df, unprocessed\n",
787
+ "\n",
788
+ "df, unprocessed_columns = data_anonymizer(df, K, MAX_CATEGORICAL_SIZE, BIN_SIZE, SENSITIVITY_MINIMUM)\n",
789
+ "if unprocessed_columns: print(f\"Failed to process columns '{unprocessed_columns}'\")\n",
790
+ "df"
791
+ ]
792
  }
793
  ],
794
  "metadata": {
modules.py CHANGED
@@ -55,56 +55,106 @@ def data_cleaner(df, drop_missing=False, remove_duplicates=True):
55
  if remove_duplicates: df = df.drop_duplicates()
56
  return df
57
 
58
- def unique_ratio(df, col):
59
- return df[col].nunique()/df[col].count()
60
 
61
- def bin_numeric(df, name_col: str, num_bins: int):
62
-
63
- df_copy = df.copy().select_dtypes(include=np.number)
64
-
65
- col_name = df[name_col].sort_values()
66
- min_, max_ = col_name.min(), col_name.max()
67
- bins = np.array_split(col_name.values, num_bins)
68
- pivots = [min_] + [b[0] for b in bins[1:]] + [max_]
69
- bins_list = [(pivots[i], pivots[i+1]) for i in range(num_bins)]
70
-
71
- for bin_min, bin_max in bins_list:
72
-
73
- for row in df_copy.index:
74
- if bin_min <= df_copy.loc[row, name_col] < bin_max:
75
- df.loc[row, name_col] = f"{bin_min} - {bin_max}"
76
-
77
- return df
78
-
79
- def get_kanon_false(df, k=2):
80
- df = df.select_dtypes(include=np.number)
81
- k_anon_false = set() # columns containing non-unique k-tuples - need anonymization
82
- pairwise_combinations = list(combinations(df.columns, k)) # get k-wise combinations of all columns in data
83
- check = lambda x: x == k-1
84
-
85
- for k_tuple in pairwise_combinations:
86
-
87
- # if k_tuple in k_anon_false:
88
- # continue
89
-
90
- k_pair_counts = df.loc[:, k_tuple].value_counts().tolist() # checks for n_unique_values for each k-tuple
91
-
92
- if any(check(i) for i in k_pair_counts): # if any value corresponding to the k-tuple is >1, i.e. non-unique
93
- k_anon_false.add((k_tuple[0], unique_ratio(df, k_tuple[0])))
94
- k_anon_false.add((k_tuple[1], unique_ratio(df, k_tuple[1])))
95
 
96
- return sorted(k_anon_false, key = lambda x:x[1], reverse = True)
97
-
98
- def k_anonymize(df, k=2):
99
- k_anon_false = get_kanon_false(df)
100
- while k_anon_false:
101
- for i in k_anon_false:
102
- col, _ = i
103
- print(f"Binning {col}")
104
- df = bin_numeric(df, col, num_bins = 15)
105
- k_anon_false = get_kanon_false(df)
106
- print(f"Updated sensitivity: {k_anon_false}")
107
  return df
108
 
109
- def data_anonymizer(df, k=2):
110
- return k_anonymize(df, k)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  if remove_duplicates: df = df.drop_duplicates()
56
  return df
57
 
58
def column_combinations(df, k):
    """Return every k-wise combination of the frame's column labels, in order."""
    return [combo for combo in combinations(df.columns, k)]
60
 
61
def k_redact(df, k):
    """Suppress rows that are unique across any k-column combination.

    For each combination of k columns, a row whose value tuple occurs exactly
    once in those columns is re-identifiable; its values in those columns are
    replaced with None. Mutates df in place and returns it.

    Bug fix: the original filtered a copy (`df_search = df.loc[:, columns]`)
    and assigned None into that copy, so nothing was ever written back to df;
    it also kept narrowing the same copy across unrelated sensitive keys.
    """
    for columns in combinations(df.columns, k):
        columns = list(columns)
        subset = df.loc[:, columns]
        # Value tuples appearing exactly once violate k-anonymity.
        # (value_counts skips rows containing nulls, as before.)
        unique_keys = {
            key for key, count in subset.value_counts().items() if count == 1
        }
        if not unique_keys:
            continue
        # Redact the offending rows directly in df.
        violating = subset.apply(tuple, axis=1).isin(unique_keys)
        df.loc[violating, columns] = None
    return df
81
 
82
def sensitive_values(series, sensitivity_minimum):
    """Return the set of values occurring fewer than sensitivity_minimum times."""
    counts = series.value_counts()
    return set(counts.index[counts < sensitivity_minimum])
88
+
89
def drop_sensitive(series, sensitivity_minimum):
    """Null out, in place, every value rarer than sensitivity_minimum."""
    rare = sensitive_values(series, sensitivity_minimum)
    series.loc[series.isin(rare)] = None
91
+
92
def bin_numeric(df, to_process, bin_size, sensitivity_minimum):
    """Replace numeric columns with (bin_min, bin_max) range tuples.

    Each numeric column is split into roughly rows // bin_size equal-sized
    bins over its sorted values; every cell becomes the (min, max) range of
    the bin it falls into, and ranges that are still too rare are nulled.

    Returns (df, set of column names left unprocessed).

    Bug fixes vs. the original: it prepended/appended scalar min/max to the
    split list but then indexed only the first num_bins entries, silently
    dropping the top bin so the largest values were never assigned a range;
    and rows < bin_size made num_bins zero, crashing np.array_split.
    """
    processed = set()
    rows, _ = df.shape
    # At least one bin so small frames don't make array_split fail.
    num_bins = max(rows // bin_size, 1)
    for column_name in to_process:
        column = df[column_name]
        # Only numeric dtypes (bool/int/uint/float/complex) are binnable.
        if column.dtype.kind not in "biufc":
            continue
        ordered = np.sort(np.asarray(column))
        bins = [
            (split.min(), split.max())
            for split in np.array_split(ordered, num_bins)
            if split.size  # array_split may emit empty tails
        ]
        result = [None] * rows
        for bin_min, bin_max in bins:
            for i, value in enumerate(column):
                if bin_min <= value <= bin_max:
                    result[i] = (bin_min, bin_max)
        df[column_name] = result
        drop_sensitive(df[column_name], sensitivity_minimum)
        processed.add(column_name)
    return df, to_process - processed
116
+
117
def find_categorical(df, to_process, max_categorical_size, sensitivity_minimum):
    """Treat low-cardinality columns as categorical and null their rare values.

    Returns (df, set of column names left unprocessed).
    """
    processed = set()
    for column_name in to_process:
        series = df[column_name]
        if series.nunique() > max_categorical_size:
            continue  # too many distinct values to treat as categorical
        drop_sensitive(series, sensitivity_minimum)
        processed.add(column_name)
    return df, to_process - processed
125
+
126
def redact(df, to_process, sensitivity_minimum):
    """Null out rare values in object-dtype (free-form text) columns.

    Returns (df, set of column names left unprocessed).
    """
    processed = set()
    for column_name in to_process:
        # Non-object columns fall through to the categorical/numeric passes.
        if df[column_name].dtype != object:
            continue
        drop_sensitive(df[column_name], sensitivity_minimum)
        processed.add(column_name)
    return df, to_process - processed
139
+
140
def anonymize(df, max_categorical_size, bin_size, sensitivity_minimum):
    """Run the column-wise anonymization passes in order.

    Returns (df, set of column names no pass was able to handle).
    """
    remaining = set(df.columns)
    df, remaining = redact(df, remaining, sensitivity_minimum)
    df, remaining = find_categorical(df, remaining, max_categorical_size, sensitivity_minimum)
    df, remaining = bin_numeric(df, remaining, bin_size, sensitivity_minimum)
    return df, remaining
146
+
147
def data_anonymizer(df, k, max_categorical_size, bin_size, sensitivity_minimum):
    """Anonymize df, enforce k-anonymity, and restore integer dtypes.

    Returns (df, set of column names that could not be processed).
    """
    original_dtypes = dict(df.dtypes)
    df, unprocessed = anonymize(df, max_categorical_size, bin_size, sensitivity_minimum)
    df = k_redact(df, k)

    # Type correction: nulling values upcasts int columns to float; move
    # those to the nullable Int64 dtype so they read as integers again.
    for name, after in df.dtypes.items():
        before = original_dtypes[name]
        if before == after:
            continue
        if before.kind == "i" and after.kind == "f":
            df[name] = df[name].astype("Int64")

    return df, unprocessed