Spaces:
Runtime error
Runtime error
ziggycross
commited on
Commit
•
003953a
1
Parent(s):
a51662f
Improved k-anonymizer.
Browse files- app.py +10 -10
- loader-cleaner.ipynb +707 -3
- modules.py +99 -49
app.py
CHANGED
@@ -4,6 +4,7 @@ from streamlit_extras.let_it_rain import rain
|
|
4 |
|
5 |
# Options
|
6 |
DISCLAIMER = "*Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam urna sem, bibendum efficitur pellentesque a, sollicitudin pharetra urna. Nam vel lectus vitae elit luctus feugiat a a purus. Aenean mollis quis ipsum sed ornare. Nunc sit amet ultricies tellus. Vivamus vulputate sem id molestie viverra. Etiam egestas lobortis enim, sit amet lobortis ligula sollicitudin vel. Nunc eget ipsum sollicitudin, convallis.*"
|
|
|
7 |
|
8 |
# Page Config
|
9 |
st.set_page_config(layout="wide")
|
@@ -22,19 +23,18 @@ if df is None: # Await file to be uploaded
|
|
22 |
else:
|
23 |
### PRE-TRANSFORM features for sidebar
|
24 |
with st.sidebar:
|
25 |
-
# Options for data
|
26 |
-
with st.container() as
|
27 |
-
st.markdown("### Data
|
28 |
remove_duplicates = st.checkbox("Remove duplicate rows", value=True)
|
29 |
drop_missing = st.checkbox("Remove rows with missing values", value=False)
|
30 |
|
31 |
# Options for data optimization
|
32 |
with st.container() as anonymizing_options:
|
33 |
st.markdown("### Anonymizing options:")
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
sample_dropdown = st.selectbox("Test dropdown", ["A", "B", "C"], index=1)
|
38 |
|
39 |
|
40 |
### DATA PREVIEW AND TRANSFORM
|
@@ -46,8 +46,7 @@ else:
|
|
46 |
|
47 |
# Transform data
|
48 |
df = modules.data_cleaner(df, drop_missing, remove_duplicates)
|
49 |
-
df = modules.data_anonymizer(df)
|
50 |
-
# download_file = modules.create_file(df, ".csv")
|
51 |
|
52 |
# Preview data after before_data
|
53 |
with st.container() as after_data:
|
@@ -60,8 +59,9 @@ else:
|
|
60 |
with st.sidebar:
|
61 |
# Options for download
|
62 |
with st.container() as download_header:
|
63 |
-
st.markdown("### Download")
|
64 |
output_extension = st.selectbox("File type", [".csv", ".json", ".xlsx"])
|
|
|
65 |
|
66 |
# Prepare file for download
|
67 |
with st.container() as downloader:
|
|
|
4 |
|
5 |
# Options
|
6 |
DISCLAIMER = "*Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam urna sem, bibendum efficitur pellentesque a, sollicitudin pharetra urna. Nam vel lectus vitae elit luctus feugiat a a purus. Aenean mollis quis ipsum sed ornare. Nunc sit amet ultricies tellus. Vivamus vulputate sem id molestie viverra. Etiam egestas lobortis enim, sit amet lobortis ligula sollicitudin vel. Nunc eget ipsum sollicitudin, convallis.*"
|
7 |
+
K = 2
|
8 |
|
9 |
# Page Config
|
10 |
st.set_page_config(layout="wide")
|
|
|
23 |
else:
|
24 |
### PRE-TRANSFORM features for sidebar
|
25 |
with st.sidebar:
|
26 |
+
# Options for data loading
|
27 |
+
with st.container() as loading_options:
|
28 |
+
st.markdown("### Data loading options:")
|
29 |
remove_duplicates = st.checkbox("Remove duplicate rows", value=True)
|
30 |
drop_missing = st.checkbox("Remove rows with missing values", value=False)
|
31 |
|
32 |
# Options for data optimization
|
33 |
with st.container() as anonymizing_options:
|
34 |
st.markdown("### Anonymizing options:")
|
35 |
+
max_categorical_size = st.slider("Maximum number of categories", min_value=2, max_value=200, value=50)
|
36 |
+
bin_size = st.slider("Target bin size", min_value=2, max_value=200, value=20)
|
37 |
+
sensitivity_minimum = st.number_input("Minimum count", min_value=2, max_value=10, value=2)
|
|
|
38 |
|
39 |
|
40 |
### DATA PREVIEW AND TRANSFORM
|
|
|
46 |
|
47 |
# Transform data
|
48 |
df = modules.data_cleaner(df, drop_missing, remove_duplicates)
|
49 |
+
df, unprocessed = modules.data_anonymizer(df, K, max_categorical_size, bin_size, sensitivity_minimum)
|
|
|
50 |
|
51 |
# Preview data after before_data
|
52 |
with st.container() as after_data:
|
|
|
59 |
with st.sidebar:
|
60 |
# Options for download
|
61 |
with st.container() as download_header:
|
62 |
+
st.markdown("### Download options:")
|
63 |
output_extension = st.selectbox("File type", [".csv", ".json", ".xlsx"])
|
64 |
+
if unprocessed: st.markdown(f"Error encountered when processing columns {str(unprocessed)}")
|
65 |
|
66 |
# Prepare file for download
|
67 |
with st.container() as downloader:
|
loader-cleaner.ipynb
CHANGED
@@ -6,8 +6,10 @@
|
|
6 |
"metadata": {},
|
7 |
"outputs": [],
|
8 |
"source": [
|
9 |
-
"
|
10 |
-
"import
|
|
|
|
|
11 |
]
|
12 |
},
|
13 |
{
|
@@ -79,12 +81,714 @@
|
|
79 |
"metadata": {},
|
80 |
"outputs": [],
|
81 |
"source": [
|
82 |
-
"DROP_MISSING =
|
83 |
"REMOVE_DUPLICATES = True\n",
|
84 |
"\n",
|
85 |
"df = df.dropna(how=\"any\" if DROP_MISSING else \"all\")\n",
|
86 |
"if REMOVE_DUPLICATES: df = df.drop_duplicates()"
|
87 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
}
|
89 |
],
|
90 |
"metadata": {
|
|
|
6 |
"metadata": {},
|
7 |
"outputs": [],
|
8 |
"source": [
|
9 |
+
"from itertools import combinations\n",
|
10 |
+
"import numpy as np\n",
|
11 |
+
"import os\n",
|
12 |
+
"import pandas as pd"
|
13 |
]
|
14 |
},
|
15 |
{
|
|
|
81 |
"metadata": {},
|
82 |
"outputs": [],
|
83 |
"source": [
|
84 |
+
"DROP_MISSING = False\n",
|
85 |
"REMOVE_DUPLICATES = True\n",
|
86 |
"\n",
|
87 |
"df = df.dropna(how=\"any\" if DROP_MISSING else \"all\")\n",
|
88 |
"if REMOVE_DUPLICATES: df = df.drop_duplicates()"
|
89 |
]
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"attachments": {},
|
93 |
+
"cell_type": "markdown",
|
94 |
+
"metadata": {},
|
95 |
+
"source": [
|
96 |
+
"### Anonymize data"
|
97 |
+
]
|
98 |
+
},
|
99 |
+
{
|
100 |
+
"cell_type": "code",
|
101 |
+
"execution_count": 4,
|
102 |
+
"metadata": {},
|
103 |
+
"outputs": [
|
104 |
+
{
|
105 |
+
"name": "stderr",
|
106 |
+
"output_type": "stream",
|
107 |
+
"text": [
|
108 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
109 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
110 |
+
"\n",
|
111 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
112 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
113 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
114 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
115 |
+
"\n",
|
116 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
117 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
118 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
119 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
120 |
+
"\n",
|
121 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
122 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
123 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
124 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
125 |
+
"\n",
|
126 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
127 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
128 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
129 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
130 |
+
"\n",
|
131 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
132 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
133 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
134 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
135 |
+
"\n",
|
136 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
137 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
138 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
139 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
140 |
+
"\n",
|
141 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
142 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
143 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
144 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
145 |
+
"\n",
|
146 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
147 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
148 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
149 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
150 |
+
"\n",
|
151 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
152 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
153 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
154 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
155 |
+
"\n",
|
156 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
157 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
158 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
159 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
160 |
+
"\n",
|
161 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
162 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
163 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
164 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
165 |
+
"\n",
|
166 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
167 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
168 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
169 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
170 |
+
"\n",
|
171 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
172 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
173 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
174 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
175 |
+
"\n",
|
176 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
177 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
178 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
179 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
180 |
+
"\n",
|
181 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
182 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
183 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
184 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
185 |
+
"\n",
|
186 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
187 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
188 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
189 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
190 |
+
"\n",
|
191 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
192 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
193 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
194 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
195 |
+
"\n",
|
196 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
197 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
198 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
199 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
200 |
+
"\n",
|
201 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
202 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
203 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
204 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
205 |
+
"\n",
|
206 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
207 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
208 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
209 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
210 |
+
"\n",
|
211 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
212 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
213 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
214 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
215 |
+
"\n",
|
216 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
217 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
218 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
219 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
220 |
+
"\n",
|
221 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
222 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
223 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
224 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
225 |
+
"\n",
|
226 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
227 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
228 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
229 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
230 |
+
"\n",
|
231 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
232 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
233 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
234 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
235 |
+
"\n",
|
236 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
237 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
238 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
239 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
240 |
+
"\n",
|
241 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
242 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
243 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
244 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
245 |
+
"\n",
|
246 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
247 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
248 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
249 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
250 |
+
"\n",
|
251 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
252 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
253 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
254 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
255 |
+
"\n",
|
256 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
257 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
258 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
259 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
260 |
+
"\n",
|
261 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
262 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
263 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
264 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
265 |
+
"\n",
|
266 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
267 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
268 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
269 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
270 |
+
"\n",
|
271 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
272 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
273 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
274 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
275 |
+
"\n",
|
276 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
277 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
278 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
279 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
280 |
+
"\n",
|
281 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
282 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
283 |
+
"/var/folders/7t/d7j4tqwj061958h80lldp0dh0000gn/T/ipykernel_19712/355045301.py:38: SettingWithCopyWarning: \n",
|
284 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
285 |
+
"\n",
|
286 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
287 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n"
|
288 |
+
]
|
289 |
+
},
|
290 |
+
{
|
291 |
+
"data": {
|
292 |
+
"text/html": [
|
293 |
+
"<div>\n",
|
294 |
+
"<style scoped>\n",
|
295 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
296 |
+
" vertical-align: middle;\n",
|
297 |
+
" }\n",
|
298 |
+
"\n",
|
299 |
+
" .dataframe tbody tr th {\n",
|
300 |
+
" vertical-align: top;\n",
|
301 |
+
" }\n",
|
302 |
+
"\n",
|
303 |
+
" .dataframe thead th {\n",
|
304 |
+
" text-align: right;\n",
|
305 |
+
" }\n",
|
306 |
+
"</style>\n",
|
307 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
308 |
+
" <thead>\n",
|
309 |
+
" <tr style=\"text-align: right;\">\n",
|
310 |
+
" <th></th>\n",
|
311 |
+
" <th>Employee_Name</th>\n",
|
312 |
+
" <th>EmpID</th>\n",
|
313 |
+
" <th>MarriedID</th>\n",
|
314 |
+
" <th>MaritalStatusID</th>\n",
|
315 |
+
" <th>GenderID</th>\n",
|
316 |
+
" <th>EmpStatusID</th>\n",
|
317 |
+
" <th>DeptID</th>\n",
|
318 |
+
" <th>PerfScoreID</th>\n",
|
319 |
+
" <th>FromDiversityJobFairID</th>\n",
|
320 |
+
" <th>Salary</th>\n",
|
321 |
+
" <th>...</th>\n",
|
322 |
+
" <th>ManagerName</th>\n",
|
323 |
+
" <th>ManagerID</th>\n",
|
324 |
+
" <th>RecruitmentSource</th>\n",
|
325 |
+
" <th>PerformanceScore</th>\n",
|
326 |
+
" <th>EngagementSurvey</th>\n",
|
327 |
+
" <th>EmpSatisfaction</th>\n",
|
328 |
+
" <th>SpecialProjectsCount</th>\n",
|
329 |
+
" <th>LastPerformanceReview_Date</th>\n",
|
330 |
+
" <th>DaysLateLast30</th>\n",
|
331 |
+
" <th>Absences</th>\n",
|
332 |
+
" </tr>\n",
|
333 |
+
" </thead>\n",
|
334 |
+
" <tbody>\n",
|
335 |
+
" <tr>\n",
|
336 |
+
" <th>0</th>\n",
|
337 |
+
" <td>None</td>\n",
|
338 |
+
" <td>(10022, 10042)</td>\n",
|
339 |
+
" <td>0</td>\n",
|
340 |
+
" <td>0</td>\n",
|
341 |
+
" <td>1</td>\n",
|
342 |
+
" <td>1</td>\n",
|
343 |
+
" <td>5</td>\n",
|
344 |
+
" <td>4</td>\n",
|
345 |
+
" <td>0</td>\n",
|
346 |
+
" <td>(62065, 63381)</td>\n",
|
347 |
+
" <td>...</td>\n",
|
348 |
+
" <td>Michael Albert</td>\n",
|
349 |
+
" <td>22.0</td>\n",
|
350 |
+
" <td>LinkedIn</td>\n",
|
351 |
+
" <td>Exceeds</td>\n",
|
352 |
+
" <td>(4.52, 4.68)</td>\n",
|
353 |
+
" <td>5</td>\n",
|
354 |
+
" <td>0</td>\n",
|
355 |
+
" <td>1/17/2019</td>\n",
|
356 |
+
" <td>0</td>\n",
|
357 |
+
" <td>1</td>\n",
|
358 |
+
" </tr>\n",
|
359 |
+
" <tr>\n",
|
360 |
+
" <th>1</th>\n",
|
361 |
+
" <td>None</td>\n",
|
362 |
+
" <td>(10064, 10084)</td>\n",
|
363 |
+
" <td>1</td>\n",
|
364 |
+
" <td>1</td>\n",
|
365 |
+
" <td>1</td>\n",
|
366 |
+
" <td>5</td>\n",
|
367 |
+
" <td>3</td>\n",
|
368 |
+
" <td>3</td>\n",
|
369 |
+
" <td>0</td>\n",
|
370 |
+
" <td>(92328, 104437)</td>\n",
|
371 |
+
" <td>...</td>\n",
|
372 |
+
" <td>Simon Roup</td>\n",
|
373 |
+
" <td>4.0</td>\n",
|
374 |
+
" <td>Indeed</td>\n",
|
375 |
+
" <td>Fully Meets</td>\n",
|
376 |
+
" <td>(4.9, 5.0)</td>\n",
|
377 |
+
" <td>3</td>\n",
|
378 |
+
" <td>6</td>\n",
|
379 |
+
" <td>None</td>\n",
|
380 |
+
" <td>0</td>\n",
|
381 |
+
" <td>17</td>\n",
|
382 |
+
" </tr>\n",
|
383 |
+
" <tr>\n",
|
384 |
+
" <th>2</th>\n",
|
385 |
+
" <td>None</td>\n",
|
386 |
+
" <td>(10190, 10210)</td>\n",
|
387 |
+
" <td>1</td>\n",
|
388 |
+
" <td>1</td>\n",
|
389 |
+
" <td>0</td>\n",
|
390 |
+
" <td>5</td>\n",
|
391 |
+
" <td>5</td>\n",
|
392 |
+
" <td>3</td>\n",
|
393 |
+
" <td>0</td>\n",
|
394 |
+
" <td>(64816, 66825)</td>\n",
|
395 |
+
" <td>...</td>\n",
|
396 |
+
" <td>Kissy Sullivan</td>\n",
|
397 |
+
" <td>20.0</td>\n",
|
398 |
+
" <td>LinkedIn</td>\n",
|
399 |
+
" <td>Fully Meets</td>\n",
|
400 |
+
" <td>(2.9, 3.18)</td>\n",
|
401 |
+
" <td>3</td>\n",
|
402 |
+
" <td>0</td>\n",
|
403 |
+
" <td>None</td>\n",
|
404 |
+
" <td>0</td>\n",
|
405 |
+
" <td>3</td>\n",
|
406 |
+
" </tr>\n",
|
407 |
+
" <tr>\n",
|
408 |
+
" <th>3</th>\n",
|
409 |
+
" <td>None</td>\n",
|
410 |
+
" <td>(10085, 10105)</td>\n",
|
411 |
+
" <td>1</td>\n",
|
412 |
+
" <td>1</td>\n",
|
413 |
+
" <td>0</td>\n",
|
414 |
+
" <td>1</td>\n",
|
415 |
+
" <td>5</td>\n",
|
416 |
+
" <td>3</td>\n",
|
417 |
+
" <td>0</td>\n",
|
418 |
+
" <td>(64816, 66825)</td>\n",
|
419 |
+
" <td>...</td>\n",
|
420 |
+
" <td>Elijiah Gray</td>\n",
|
421 |
+
" <td>16.0</td>\n",
|
422 |
+
" <td>Indeed</td>\n",
|
423 |
+
" <td>Fully Meets</td>\n",
|
424 |
+
" <td>(4.7, 4.88)</td>\n",
|
425 |
+
" <td>5</td>\n",
|
426 |
+
" <td>0</td>\n",
|
427 |
+
" <td>1/3/2019</td>\n",
|
428 |
+
" <td>0</td>\n",
|
429 |
+
" <td>15</td>\n",
|
430 |
+
" </tr>\n",
|
431 |
+
" <tr>\n",
|
432 |
+
" <th>4</th>\n",
|
433 |
+
" <td>None</td>\n",
|
434 |
+
" <td>(10064, 10084)</td>\n",
|
435 |
+
" <td>0</td>\n",
|
436 |
+
" <td>2</td>\n",
|
437 |
+
" <td>0</td>\n",
|
438 |
+
" <td>5</td>\n",
|
439 |
+
" <td>5</td>\n",
|
440 |
+
" <td>3</td>\n",
|
441 |
+
" <td>0</td>\n",
|
442 |
+
" <td>(47837, 51259)</td>\n",
|
443 |
+
" <td>...</td>\n",
|
444 |
+
" <td>Webster Butler</td>\n",
|
445 |
+
" <td>39.0</td>\n",
|
446 |
+
" <td>Google Search</td>\n",
|
447 |
+
" <td>Fully Meets</td>\n",
|
448 |
+
" <td>(5.0, 5.0)</td>\n",
|
449 |
+
" <td>4</td>\n",
|
450 |
+
" <td>0</td>\n",
|
451 |
+
" <td>2/1/2016</td>\n",
|
452 |
+
" <td>0</td>\n",
|
453 |
+
" <td>2</td>\n",
|
454 |
+
" </tr>\n",
|
455 |
+
" <tr>\n",
|
456 |
+
" <th>...</th>\n",
|
457 |
+
" <td>...</td>\n",
|
458 |
+
" <td>...</td>\n",
|
459 |
+
" <td>...</td>\n",
|
460 |
+
" <td>...</td>\n",
|
461 |
+
" <td>...</td>\n",
|
462 |
+
" <td>...</td>\n",
|
463 |
+
" <td>...</td>\n",
|
464 |
+
" <td>...</td>\n",
|
465 |
+
" <td>...</td>\n",
|
466 |
+
" <td>...</td>\n",
|
467 |
+
" <td>...</td>\n",
|
468 |
+
" <td>...</td>\n",
|
469 |
+
" <td>...</td>\n",
|
470 |
+
" <td>...</td>\n",
|
471 |
+
" <td>...</td>\n",
|
472 |
+
" <td>...</td>\n",
|
473 |
+
" <td>...</td>\n",
|
474 |
+
" <td>...</td>\n",
|
475 |
+
" <td>...</td>\n",
|
476 |
+
" <td>...</td>\n",
|
477 |
+
" <td>...</td>\n",
|
478 |
+
" </tr>\n",
|
479 |
+
" <tr>\n",
|
480 |
+
" <th>306</th>\n",
|
481 |
+
" <td>None</td>\n",
|
482 |
+
" <td>(10127, 10147)</td>\n",
|
483 |
+
" <td>0</td>\n",
|
484 |
+
" <td>0</td>\n",
|
485 |
+
" <td>1</td>\n",
|
486 |
+
" <td>1</td>\n",
|
487 |
+
" <td>5</td>\n",
|
488 |
+
" <td>3</td>\n",
|
489 |
+
" <td>0</td>\n",
|
490 |
+
" <td>(64816, 66825)</td>\n",
|
491 |
+
" <td>...</td>\n",
|
492 |
+
" <td>Kissy Sullivan</td>\n",
|
493 |
+
" <td>20.0</td>\n",
|
494 |
+
" <td>LinkedIn</td>\n",
|
495 |
+
" <td>Fully Meets</td>\n",
|
496 |
+
" <td>(3.99, 4.1)</td>\n",
|
497 |
+
" <td>4</td>\n",
|
498 |
+
" <td>0</td>\n",
|
499 |
+
" <td>2/28/2019</td>\n",
|
500 |
+
" <td>0</td>\n",
|
501 |
+
" <td>13</td>\n",
|
502 |
+
" </tr>\n",
|
503 |
+
" <tr>\n",
|
504 |
+
" <th>307</th>\n",
|
505 |
+
" <td>None</td>\n",
|
506 |
+
" <td>None</td>\n",
|
507 |
+
" <td>0</td>\n",
|
508 |
+
" <td>0</td>\n",
|
509 |
+
" <td>0</td>\n",
|
510 |
+
" <td>5</td>\n",
|
511 |
+
" <td>5</td>\n",
|
512 |
+
" <td>1</td>\n",
|
513 |
+
" <td>0</td>\n",
|
514 |
+
" <td>(47837, 51259)</td>\n",
|
515 |
+
" <td>...</td>\n",
|
516 |
+
" <td>Brannon Miller</td>\n",
|
517 |
+
" <td>12.0</td>\n",
|
518 |
+
" <td>Google Search</td>\n",
|
519 |
+
" <td>PIP</td>\n",
|
520 |
+
" <td>(3.19, 3.5)</td>\n",
|
521 |
+
" <td>2</td>\n",
|
522 |
+
" <td>0</td>\n",
|
523 |
+
" <td>None</td>\n",
|
524 |
+
" <td>5</td>\n",
|
525 |
+
" <td>4</td>\n",
|
526 |
+
" </tr>\n",
|
527 |
+
" <tr>\n",
|
528 |
+
" <th>308</th>\n",
|
529 |
+
" <td>None</td>\n",
|
530 |
+
" <td>(10001, 10021)</td>\n",
|
531 |
+
" <td>0</td>\n",
|
532 |
+
" <td>0</td>\n",
|
533 |
+
" <td>0</td>\n",
|
534 |
+
" <td>1</td>\n",
|
535 |
+
" <td>3</td>\n",
|
536 |
+
" <td>4</td>\n",
|
537 |
+
" <td>0</td>\n",
|
538 |
+
" <td>None</td>\n",
|
539 |
+
" <td>...</td>\n",
|
540 |
+
" <td>Janet King</td>\n",
|
541 |
+
" <td>2.0</td>\n",
|
542 |
+
" <td>Employee Referral</td>\n",
|
543 |
+
" <td>Exceeds</td>\n",
|
544 |
+
" <td>(4.52, 4.68)</td>\n",
|
545 |
+
" <td>5</td>\n",
|
546 |
+
" <td>6</td>\n",
|
547 |
+
" <td>2/21/2019</td>\n",
|
548 |
+
" <td>0</td>\n",
|
549 |
+
" <td>16</td>\n",
|
550 |
+
" </tr>\n",
|
551 |
+
" <tr>\n",
|
552 |
+
" <th>309</th>\n",
|
553 |
+
" <td>None</td>\n",
|
554 |
+
" <td>(10043, 10063)</td>\n",
|
555 |
+
" <td>0</td>\n",
|
556 |
+
" <td>0</td>\n",
|
557 |
+
" <td>0</td>\n",
|
558 |
+
" <td>1</td>\n",
|
559 |
+
" <td>3</td>\n",
|
560 |
+
" <td>3</td>\n",
|
561 |
+
" <td>0</td>\n",
|
562 |
+
" <td>(77692, 90100)</td>\n",
|
563 |
+
" <td>...</td>\n",
|
564 |
+
" <td>Simon Roup</td>\n",
|
565 |
+
" <td>4.0</td>\n",
|
566 |
+
" <td>Employee Referral</td>\n",
|
567 |
+
" <td>Fully Meets</td>\n",
|
568 |
+
" <td>(5.0, 5.0)</td>\n",
|
569 |
+
" <td>3</td>\n",
|
570 |
+
" <td>5</td>\n",
|
571 |
+
" <td>2/1/2019</td>\n",
|
572 |
+
" <td>0</td>\n",
|
573 |
+
" <td>11</td>\n",
|
574 |
+
" </tr>\n",
|
575 |
+
" <tr>\n",
|
576 |
+
" <th>310</th>\n",
|
577 |
+
" <td>None</td>\n",
|
578 |
+
" <td>(10252, 10271)</td>\n",
|
579 |
+
" <td>0</td>\n",
|
580 |
+
" <td>4</td>\n",
|
581 |
+
" <td>0</td>\n",
|
582 |
+
" <td>1</td>\n",
|
583 |
+
" <td>5</td>\n",
|
584 |
+
" <td>3</td>\n",
|
585 |
+
" <td>0</td>\n",
|
586 |
+
" <td>(45046, 47750)</td>\n",
|
587 |
+
" <td>...</td>\n",
|
588 |
+
" <td>David Stanley</td>\n",
|
589 |
+
" <td>14.0</td>\n",
|
590 |
+
" <td>LinkedIn</td>\n",
|
591 |
+
" <td>Fully Meets</td>\n",
|
592 |
+
" <td>(4.5, 4.52)</td>\n",
|
593 |
+
" <td>5</td>\n",
|
594 |
+
" <td>0</td>\n",
|
595 |
+
" <td>1/30/2019</td>\n",
|
596 |
+
" <td>0</td>\n",
|
597 |
+
" <td>2</td>\n",
|
598 |
+
" </tr>\n",
|
599 |
+
" </tbody>\n",
|
600 |
+
"</table>\n",
|
601 |
+
"<p>311 rows × 36 columns</p>\n",
|
602 |
+
"</div>"
|
603 |
+
],
|
604 |
+
"text/plain": [
|
605 |
+
" Employee_Name EmpID MarriedID MaritalStatusID GenderID \\\n",
|
606 |
+
"0 None (10022, 10042) 0 0 1 \n",
|
607 |
+
"1 None (10064, 10084) 1 1 1 \n",
|
608 |
+
"2 None (10190, 10210) 1 1 0 \n",
|
609 |
+
"3 None (10085, 10105) 1 1 0 \n",
|
610 |
+
"4 None (10064, 10084) 0 2 0 \n",
|
611 |
+
".. ... ... ... ... ... \n",
|
612 |
+
"306 None (10127, 10147) 0 0 1 \n",
|
613 |
+
"307 None None 0 0 0 \n",
|
614 |
+
"308 None (10001, 10021) 0 0 0 \n",
|
615 |
+
"309 None (10043, 10063) 0 0 0 \n",
|
616 |
+
"310 None (10252, 10271) 0 4 0 \n",
|
617 |
+
"\n",
|
618 |
+
" EmpStatusID DeptID PerfScoreID FromDiversityJobFairID \\\n",
|
619 |
+
"0 1 5 4 0 \n",
|
620 |
+
"1 5 3 3 0 \n",
|
621 |
+
"2 5 5 3 0 \n",
|
622 |
+
"3 1 5 3 0 \n",
|
623 |
+
"4 5 5 3 0 \n",
|
624 |
+
".. ... ... ... ... \n",
|
625 |
+
"306 1 5 3 0 \n",
|
626 |
+
"307 5 5 1 0 \n",
|
627 |
+
"308 1 3 4 0 \n",
|
628 |
+
"309 1 3 3 0 \n",
|
629 |
+
"310 1 5 3 0 \n",
|
630 |
+
"\n",
|
631 |
+
" Salary ... ManagerName ManagerID RecruitmentSource \\\n",
|
632 |
+
"0 (62065, 63381) ... Michael Albert 22.0 LinkedIn \n",
|
633 |
+
"1 (92328, 104437) ... Simon Roup 4.0 Indeed \n",
|
634 |
+
"2 (64816, 66825) ... Kissy Sullivan 20.0 LinkedIn \n",
|
635 |
+
"3 (64816, 66825) ... Elijiah Gray 16.0 Indeed \n",
|
636 |
+
"4 (47837, 51259) ... Webster Butler 39.0 Google Search \n",
|
637 |
+
".. ... ... ... ... ... \n",
|
638 |
+
"306 (64816, 66825) ... Kissy Sullivan 20.0 LinkedIn \n",
|
639 |
+
"307 (47837, 51259) ... Brannon Miller 12.0 Google Search \n",
|
640 |
+
"308 None ... Janet King 2.0 Employee Referral \n",
|
641 |
+
"309 (77692, 90100) ... Simon Roup 4.0 Employee Referral \n",
|
642 |
+
"310 (45046, 47750) ... David Stanley 14.0 LinkedIn \n",
|
643 |
+
"\n",
|
644 |
+
" PerformanceScore EngagementSurvey EmpSatisfaction SpecialProjectsCount \\\n",
|
645 |
+
"0 Exceeds (4.52, 4.68) 5 0 \n",
|
646 |
+
"1 Fully Meets (4.9, 5.0) 3 6 \n",
|
647 |
+
"2 Fully Meets (2.9, 3.18) 3 0 \n",
|
648 |
+
"3 Fully Meets (4.7, 4.88) 5 0 \n",
|
649 |
+
"4 Fully Meets (5.0, 5.0) 4 0 \n",
|
650 |
+
".. ... ... ... ... \n",
|
651 |
+
"306 Fully Meets (3.99, 4.1) 4 0 \n",
|
652 |
+
"307 PIP (3.19, 3.5) 2 0 \n",
|
653 |
+
"308 Exceeds (4.52, 4.68) 5 6 \n",
|
654 |
+
"309 Fully Meets (5.0, 5.0) 3 5 \n",
|
655 |
+
"310 Fully Meets (4.5, 4.52) 5 0 \n",
|
656 |
+
"\n",
|
657 |
+
" LastPerformanceReview_Date DaysLateLast30 Absences \n",
|
658 |
+
"0 1/17/2019 0 1 \n",
|
659 |
+
"1 None 0 17 \n",
|
660 |
+
"2 None 0 3 \n",
|
661 |
+
"3 1/3/2019 0 15 \n",
|
662 |
+
"4 2/1/2016 0 2 \n",
|
663 |
+
".. ... ... ... \n",
|
664 |
+
"306 2/28/2019 0 13 \n",
|
665 |
+
"307 None 5 4 \n",
|
666 |
+
"308 2/21/2019 0 16 \n",
|
667 |
+
"309 2/1/2019 0 11 \n",
|
668 |
+
"310 1/30/2019 0 2 \n",
|
669 |
+
"\n",
|
670 |
+
"[311 rows x 36 columns]"
|
671 |
+
]
|
672 |
+
},
|
673 |
+
"execution_count": 4,
|
674 |
+
"metadata": {},
|
675 |
+
"output_type": "execute_result"
|
676 |
+
}
|
677 |
+
],
|
678 |
+
"source": [
|
679 |
+
"K = 2\n",
|
680 |
+
"MAX_CATEGORICAL_SIZE = 50\n",
|
681 |
+
"BIN_SIZE = 20\n",
|
682 |
+
"SENSITIVITY_MINIMUM = 2\n",
|
683 |
+
"\n",
|
684 |
+
"def column_combinations(df, k):\n",
|
685 |
+
" return list(combinations(df.columns, k))\n",
|
686 |
+
"\n",
|
687 |
+
"def k_redact(df, k):\n",
|
688 |
+
" kwise_combinations = column_combinations(df, k) \n",
|
689 |
+
" \n",
|
690 |
+
" for columns in kwise_combinations:\n",
|
691 |
+
" df_search = df.loc[:, columns]\n",
|
692 |
+
" sensitive_data = [\n",
|
693 |
+
" (columns, key)\n",
|
694 |
+
" for key, value\n",
|
695 |
+
" in df_search.value_counts().to_dict().items()\n",
|
696 |
+
" if value == 1\n",
|
697 |
+
" ]\n",
|
698 |
+
" if not sensitive_data: continue\n",
|
699 |
+
" for columns, values in sensitive_data:\n",
|
700 |
+
" for column, value in zip(columns, values):\n",
|
701 |
+
" df_search = df_search.loc[df[column] == value]\n",
|
702 |
+
" if df_search.shape[0] == 1:\n",
|
703 |
+
" for column in columns:\n",
|
704 |
+
" df_search[column] = None\n",
|
705 |
+
" \n",
|
706 |
+
" return df\n",
|
707 |
+
"\n",
|
708 |
+
"def sensitive_values(series, sensitivity_minimum):\n",
|
709 |
+
" return {key\n",
|
710 |
+
" for key, value\n",
|
711 |
+
" in series.value_counts().to_dict().items()\n",
|
712 |
+
" if value < sensitivity_minimum\n",
|
713 |
+
" }\n",
|
714 |
+
"\n",
|
715 |
+
"def drop_sensitive(series, sensitivity_minimum):\n",
|
716 |
+
" series.loc[series.isin(sensitive_values(series, sensitivity_minimum))] = None\n",
|
717 |
+
"\n",
|
718 |
+
"def bin_numeric(df, to_process, bin_size, sensitivity_minimum):\n",
|
719 |
+
" processed = set()\n",
|
720 |
+
" rows, _ = df.shape\n",
|
721 |
+
" num_bins = rows//bin_size\n",
|
722 |
+
" for column_name in to_process:\n",
|
723 |
+
" column = df[column_name]\n",
|
724 |
+
" if column.dtype.kind not in \"biufc\": continue\n",
|
725 |
+
" array = sorted(np.array(column))\n",
|
726 |
+
" array_min, array_max = array[0], array[-1]\n",
|
727 |
+
" splits = [array_min] + list(np.array_split(array, num_bins)) + [array_max]\n",
|
728 |
+
" bins = [\n",
|
729 |
+
" (np.min(split), np.max(split))\n",
|
730 |
+
" for split\n",
|
731 |
+
" in (splits[i] for i in range(num_bins))\n",
|
732 |
+
" ]\n",
|
733 |
+
" result = [None] * rows\n",
|
734 |
+
" for bin_min, bin_max in bins:\n",
|
735 |
+
" for i, value in enumerate(column):\n",
|
736 |
+
" if bin_min <= value <= bin_max:\n",
|
737 |
+
" result[i] = (bin_min, bin_max)\n",
|
738 |
+
" df[column_name] = result\n",
|
739 |
+
" drop_sensitive(df[column_name], sensitivity_minimum)\n",
|
740 |
+
" processed.add(column_name)\n",
|
741 |
+
" return df, to_process - processed\n",
|
742 |
+
"\n",
|
743 |
+
"def find_categorical(df, to_process, max_categorical_size, sensitivity_minimum):\n",
|
744 |
+
" processed = set()\n",
|
745 |
+
" for column_name in to_process:\n",
|
746 |
+
" column = df[column_name]\n",
|
747 |
+
" if column.nunique() <= max_categorical_size:\n",
|
748 |
+
" drop_sensitive(column, sensitivity_minimum)\n",
|
749 |
+
" processed.add(column_name)\n",
|
750 |
+
" return df, to_process - processed\n",
|
751 |
+
"\n",
|
752 |
+
"def redact(df, to_process, sensitivity_minimum):\n",
|
753 |
+
" processed = set()\n",
|
754 |
+
" for column_name in to_process:\n",
|
755 |
+
" column = df[column_name]\n",
|
756 |
+
" \n",
|
757 |
+
" is_object = column.dtype == object\n",
|
758 |
+
" if not is_object: continue\n",
|
759 |
+
"\n",
|
760 |
+
" # Check if any unique values exist, and redact them\n",
|
761 |
+
" drop_sensitive(column, sensitivity_minimum)\n",
|
762 |
+
" processed.add(column_name)\n",
|
763 |
+
"\n",
|
764 |
+
" return df, to_process - processed\n",
|
765 |
+
"\n",
|
766 |
+
"def anonymize(df, max_categorical_size, bin_size, sensitivity_minimum):\n",
|
767 |
+
" to_process = set(df.columns)\n",
|
768 |
+
" df, to_process = redact(df, to_process, sensitivity_minimum)\n",
|
769 |
+
" df, to_process = find_categorical(df, to_process, max_categorical_size, sensitivity_minimum)\n",
|
770 |
+
" df, to_process = bin_numeric(df, to_process, bin_size, sensitivity_minimum)\n",
|
771 |
+
" return df, to_process\n",
|
772 |
+
"\n",
|
773 |
+
"def data_anonymizer(df, k, max_categorical_size, bin_size, sensitivity_minimum):\n",
|
774 |
+
" start_dtypes = df.dtypes.to_dict()\n",
|
775 |
+
" df, unprocessed = anonymize(df, max_categorical_size, bin_size, sensitivity_minimum)\n",
|
776 |
+
" df = k_redact(df, k)\n",
|
777 |
+
" end_dtypes = df.dtypes.to_dict()\n",
|
778 |
+
"\n",
|
779 |
+
" # Type correction\n",
|
780 |
+
" for column in df.columns:\n",
|
781 |
+
" start_type, end_type = start_dtypes[column], end_dtypes[column]\n",
|
782 |
+
" if start_type == end_type: continue\n",
|
783 |
+
" if start_type.kind == \"i\" and end_type.kind == \"f\":\n",
|
784 |
+
" df[column] = df[column].astype(\"Int64\")\n",
|
785 |
+
"\n",
|
786 |
+
" return df, unprocessed\n",
|
787 |
+
"\n",
|
788 |
+
"df, unprocessed_columns = data_anonymizer(df, K, MAX_CATEGORICAL_SIZE, BIN_SIZE, SENSITIVITY_MINIMUM)\n",
|
789 |
+
"if unprocessed_columns: print(f\"Failed to process columns '{unprocessed_columns}'\")\n",
|
790 |
+
"df"
|
791 |
+
]
|
792 |
}
|
793 |
],
|
794 |
"metadata": {
|
modules.py
CHANGED
@@ -55,56 +55,106 @@ def data_cleaner(df, drop_missing=False, remove_duplicates=True):
|
|
55 |
if remove_duplicates: df = df.drop_duplicates()
|
56 |
return df
|
57 |
|
58 |
-
def
|
59 |
-
return
|
60 |
|
61 |
-
def
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
for
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
def get_kanon_false(df, k=2):
|
80 |
-
df = df.select_dtypes(include=np.number)
|
81 |
-
k_anon_false = set() # columns containing non-unique k-tuples - need anonymization
|
82 |
-
pairwise_combinations = list(combinations(df.columns, k)) # get k-wise combinations of all columns in data
|
83 |
-
check = lambda x: x == k-1
|
84 |
-
|
85 |
-
for k_tuple in pairwise_combinations:
|
86 |
-
|
87 |
-
# if k_tuple in k_anon_false:
|
88 |
-
# continue
|
89 |
-
|
90 |
-
k_pair_counts = df.loc[:, k_tuple].value_counts().tolist() # checks for n_unique_values for each k-tuple
|
91 |
-
|
92 |
-
if any(check(i) for i in k_pair_counts): # if any value corresponding to the k-tuple is >1, i.e. non-unique
|
93 |
-
k_anon_false.add((k_tuple[0], unique_ratio(df, k_tuple[0])))
|
94 |
-
k_anon_false.add((k_tuple[1], unique_ratio(df, k_tuple[1])))
|
95 |
|
96 |
-
return sorted(k_anon_false, key = lambda x:x[1], reverse = True)
|
97 |
-
|
98 |
-
def k_anonymize(df, k=2):
|
99 |
-
k_anon_false = get_kanon_false(df)
|
100 |
-
while k_anon_false:
|
101 |
-
for i in k_anon_false:
|
102 |
-
col, _ = i
|
103 |
-
print(f"Binning {col}")
|
104 |
-
df = bin_numeric(df, col, num_bins = 15)
|
105 |
-
k_anon_false = get_kanon_false(df)
|
106 |
-
print(f"Updated sensitivity: {k_anon_false}")
|
107 |
return df
|
108 |
|
109 |
-
def
|
110 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
if remove_duplicates: df = df.drop_duplicates()
|
56 |
return df
|
57 |
|
58 |
+
def column_combinations(df, k):
    """Return every k-wise combination of the frame's column labels, as a list of tuples."""
    return [combo for combo in combinations(df.columns, k)]
|
60 |
|
61 |
+
def k_redact(df, k):
    """Redact k-wise column combinations that uniquely identify a row.

    For every combination of k columns, any tuple of values that occurs
    exactly once is a quasi-identifier for a single row; that row's values
    in those columns are replaced with None so no k-column fingerprint
    remains unique.  The frame is modified in place and also returned.
    """
    for columns in combinations(df.columns, k):
        # Count how often each k-tuple of values occurs (NaN-containing rows
        # are dropped by value_counts, matching the original behaviour).
        counts = df.loc[:, list(columns)].value_counts()
        unique_tuples = [key for key, count in counts.items() if count == 1]
        for values in unique_tuples:
            # Build a mask selecting the (single) row with this fingerprint.
            # Bug fix: the original narrowed a *copy* of the frame and wrote
            # None into that copy, so the redaction never reached `df`; it
            # also reused the narrowed copy across fingerprints instead of
            # restarting from the full frame each time.
            mask = df[columns[0]] == values[0]
            for column, value in zip(columns[1:], values[1:]):
                mask &= df[column] == value
            # NOTE: writing None into an int column upcasts it; the caller
            # (data_anonymizer) restores nullable ints afterwards.
            df.loc[mask, list(columns)] = None
    return df
|
81 |
|
82 |
+
def sensitive_values(series, sensitivity_minimum):
    """Return the set of values that appear fewer than ``sensitivity_minimum`` times."""
    counts = series.value_counts()
    return set(counts.index[counts < sensitivity_minimum])
|
88 |
+
|
89 |
+
def drop_sensitive(series, sensitivity_minimum):
    """Null out, in place, every value occurring fewer than ``sensitivity_minimum`` times."""
    counts = series.value_counts()
    rare = set(counts.index[counts < sensitivity_minimum])
    series.loc[series.isin(rare)] = None
|
91 |
+
|
92 |
+
def bin_numeric(df, to_process, bin_size, sensitivity_minimum):
    """Replace each numeric column's values with (bin_min, bin_max) range tuples.

    Values are sorted and split into roughly equal-sized bins of about
    ``bin_size`` rows each; every cell is replaced by the (min, max) bounds
    of the bin it falls into.  Bins that still occur too rarely are then
    redacted via ``drop_sensitive``.

    Returns the mutated frame and the set of column names left unprocessed
    (i.e. the non-numeric columns).
    """
    processed = set()
    rows, _ = df.shape
    # At least one bin, even for frames smaller than bin_size — the original
    # `rows // bin_size` could be 0 and crash np.array_split.
    num_bins = max(rows // bin_size, 1)
    for column_name in to_process:
        column = df[column_name]
        # Only numeric dtypes (bool/int/uint/float/complex) are binnable.
        if column.dtype.kind not in "biufc":
            continue
        ordered = np.sort(column.to_numpy())
        # Bug fix: the original built `[min] + splits + [max]` and iterated
        # only the first num_bins entries, so the first "bin" was the
        # degenerate (min, min) and the last split never became a bin —
        # the largest values stayed None.
        bins = [
            (np.min(split), np.max(split))
            for split in np.array_split(ordered, num_bins)
            if split.size > 0
        ]
        result = [None] * rows
        for bin_min, bin_max in bins:
            for i, value in enumerate(column):
                if bin_min <= value <= bin_max:
                    result[i] = (bin_min, bin_max)
        df[column_name] = result
        drop_sensitive(df[column_name], sensitivity_minimum)
        processed.add(column_name)
    return df, to_process - processed
|
116 |
+
|
117 |
+
def find_categorical(df, to_process, max_categorical_size, sensitivity_minimum):
    """Treat low-cardinality columns as categorical and redact rare levels.

    A column with at most ``max_categorical_size`` distinct values is taken
    as categorical; levels occurring fewer than ``sensitivity_minimum``
    times are nulled out in place.  Returns the frame and the set of
    columns not handled here.
    """
    processed = set()
    for name in to_process:
        series = df[name]
        if series.nunique() > max_categorical_size:
            continue
        drop_sensitive(series, sensitivity_minimum)
        processed.add(name)
    return df, to_process - processed
|
125 |
+
|
126 |
+
def redact(df, to_process, sensitivity_minimum):
    """Redact rare values in free-text (object-dtype) columns.

    Any value occurring fewer than ``sensitivity_minimum`` times is replaced
    with None, since near-unique strings can identify individuals.
    Returns the frame and the set of columns still unhandled.
    """
    processed = set()
    for name in to_process:
        series = df[name]
        # Non-text columns are left for the categorical / numeric stages.
        if series.dtype != object:
            continue
        drop_sensitive(series, sensitivity_minimum)
        processed.add(name)
    return df, to_process - processed
|
139 |
+
|
140 |
+
def anonymize(df, max_categorical_size, bin_size, sensitivity_minimum):
    """Run every column through the generalization pipeline.

    Stages, in order: free-text redaction, categorical-level redaction,
    numeric binning.  Each stage consumes the columns it can handle;
    whatever remains unhandled is returned alongside the frame.
    """
    remaining = set(df.columns)
    stages = (
        lambda frame, cols: redact(frame, cols, sensitivity_minimum),
        lambda frame, cols: find_categorical(frame, cols, max_categorical_size, sensitivity_minimum),
        lambda frame, cols: bin_numeric(frame, cols, bin_size, sensitivity_minimum),
    )
    for stage in stages:
        df, remaining = stage(df, remaining)
    return df, remaining
|
146 |
+
|
147 |
+
def data_anonymizer(df, k, max_categorical_size, bin_size, sensitivity_minimum):
    """Full anonymization pass: column-wise generalization plus k-redaction.

    Returns the anonymized frame and the set of columns no stage could
    process.
    """
    original_types = df.dtypes.to_dict()

    df, unprocessed = anonymize(df, max_categorical_size, bin_size, sensitivity_minimum)
    df = k_redact(df, k)

    # Inserting None into an int column upcasts it to float; switch those
    # columns to pandas' nullable Int64 so they read as integers again.
    for name, final_type in df.dtypes.to_dict().items():
        initial_type = original_types[name]
        if initial_type == final_type:
            continue
        if initial_type.kind == "i" and final_type.kind == "f":
            df[name] = df[name].astype("Int64")

    return df, unprocessed
|