Spaces:

2hack2furious
/

anonymizer

Runtime error

App Files Files

ziggycross committed on Mar 6, 2023

Commit

a51662f

•

1 Parent(s): 6c3e9dd

Implemented k_anonymizer.

Browse files

Files changed (1) hide show

modules.py +55 -2

modules.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import pandas as pd
 SUPPORTED_TYPES = [".csv", ".json", ".xlsx"]
@@ -53,5 +55,56 @@ def data_cleaner(df, drop_missing=False, remove_duplicates=True):
     if remove_duplicates: df = df.drop_duplicates()
     return df
-def data_anonymizer(df):
-    return df

+from itertools import combinations
+import numpy as np
 import pandas as pd
 SUPPORTED_TYPES = [".csv", ".json", ".xlsx"]
     if remove_duplicates: df = df.drop_duplicates()
     return df
def unique_ratio(df, col):
    """Return the share of non-null values in ``df[col]`` that are distinct.

    Args:
        df: DataFrame containing the column.
        col: Label of the column to inspect.

    Returns:
        ``nunique / count`` for the column (both ignore nulls), or ``0.0``
        when the column holds no non-null values.
    """
    non_null = df[col].count()
    # An empty or all-null column would otherwise divide by zero and yield
    # NaN, which cannot be ordered reliably when callers sort on this ratio.
    if non_null == 0:
        return 0.0
    return df[col].nunique() / non_null
def bin_numeric(df, name_col: str, num_bins: int):
    """Replace a numeric column with ``"low - high"`` range labels.

    The sorted non-null values are split into ``num_bins`` chunks of nearly
    equal size; the first value of each chunk (plus the overall minimum and
    maximum) become the bin edges. Every value is then replaced by the label
    of the bin it falls in. Bins are half-open ``[low, high)`` except the
    last one, which also includes the column maximum.

    Args:
        df: Source DataFrame. It is not modified.
        name_col: Label of the numeric column to bin.
        num_bins: Requested number of bins. Fewer bins are produced when the
            column has fewer rows or repeated edge values.

    Returns:
        A copy of ``df`` in which ``name_col`` holds string range labels
        (object dtype). Null entries are left as they are.

    Raises:
        KeyError: If ``name_col`` is not a column of ``df``.
        TypeError: If ``name_col`` is not a numeric column.
        ValueError: If ``num_bins`` is smaller than 1.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer")

    column = df[name_col]
    if name_col not in df.select_dtypes(include=np.number).columns:
        raise TypeError(f"Column {name_col!r} is not numeric and cannot be binned")

    binned = df.copy()
    present = column.notna()
    values = column[present].to_numpy()
    if values.size == 0:
        # Nothing to bin: the column is empty or entirely null.
        return binned

    ordered = np.sort(values)
    chunks = np.array_split(ordered, num_bins)
    # array_split yields empty chunks when num_bins exceeds the row count;
    # those contribute no edge.
    pivots = [ordered[0]] + [c[0] for c in chunks[1:] if c.size] + [ordered[-1]]
    # Repeated values can produce duplicate pivots; collapsing them removes
    # the zero-width bins that could never match anything.
    edges = np.unique(pivots)
    if edges.size == 1:
        # Constant column: a single degenerate "v - v" bin.
        edges = np.repeat(edges, 2)

    labels = np.array(
        [f"{low} - {high}" for low, high in zip(edges[:-1], edges[1:])],
        dtype=object,
    )

    # side="right" gives half-open [low, high) bins; clipping folds the
    # column maximum into the last bin instead of leaving it unbinned.
    positions = np.searchsorted(edges, values, side="right") - 1
    positions = np.clip(positions, 0, labels.size - 1)

    relabelled = column.astype(object)
    relabelled.loc[present] = labels[positions]
    binned[name_col] = relabelled
    return binned
def get_kanon_false(df, k=2):
    """Find numeric columns that take part in a k-anonymity violation.

    Every combination of ``k`` numeric columns is examined. A combination
    violates k-anonymity when some tuple of its values occurs in fewer than
    ``k`` rows; all columns of such a combination are flagged.

    Args:
        df: DataFrame to inspect. Only numeric columns are considered.
        k: Both the number of columns combined at a time and the minimum
            number of rows that must share each value tuple.

    Returns:
        A list of ``(column, unique_ratio)`` tuples for the flagged columns,
        sorted by ratio from most to least unique. Empty when nothing is
        flagged or fewer than ``k`` numeric columns exist.
    """
    numeric = df.select_dtypes(include=np.number)
    flagged = {}  # column label -> unique ratio
    for columns in combinations(numeric.columns, k):
        # Number of rows sharing each distinct value tuple; rows containing
        # nulls are dropped by value_counts.
        group_sizes = numeric[list(columns)].value_counts()
        # Any group smaller than k breaks k-anonymity, not only groups of
        # exactly k - 1 rows.
        if (group_sizes < k).any():
            for column in columns:
                if column not in flagged:
                    flagged[column] = unique_ratio(numeric, column)
    return sorted(flagged.items(), key=lambda item: item[1], reverse=True)
def k_anonymize(df, k=2, num_bins=15):
    """Bin numeric columns until no k-anonymity violation is reported.

    On each pass the most unique flagged column is binned, then the frame is
    re-checked, so binning stops as soon as the check comes back clean.

    Args:
        df: DataFrame to anonymize.
        k: Anonymity level forwarded to ``get_kanon_false``.
        num_bins: Number of bins requested from ``bin_numeric`` per column.

    Returns:
        The DataFrame returned by the last ``bin_numeric`` call, or ``df``
        itself when nothing needed binning.
    """
    attempted = set()  # columns already binned; guards against endless retries
    k_anon_false = get_kanon_false(df, k)
    while k_anon_false:
        # The list is sorted most-unique first; take the first column that
        # has not been binned yet.
        col = next((name for name, _ in k_anon_false if name not in attempted), None)
        if col is None:
            # Every flagged column was already binned once; binning again
            # cannot make progress.
            break
        attempted.add(col)
        print(f"Binning {col}")
        df = bin_numeric(df, col, num_bins=num_bins)
        k_anon_false = get_kanon_false(df, k)
        print(f"Updated sensitivity: {k_anon_false}")
    return df
def data_anonymizer(df, k=2):
    """Anonymize ``df`` by delegating to ``k_anonymize``.

    Args:
        df: DataFrame to anonymize.
        k: Anonymity level passed through to ``k_anonymize``.

    Returns:
        Whatever ``k_anonymize`` returns for the given arguments.
    """
    anonymized = k_anonymize(df, k)
    return anonymized