ziggycross committed on
Commit
a51662f
1 Parent(s): 6c3e9dd

Implemented k_anonymizer.

Browse files
Files changed (1) hide show
  1. modules.py +55 -2
modules.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import pandas as pd
2
 
3
  SUPPORTED_TYPES = [".csv", ".json", ".xlsx"]
@@ -53,5 +55,56 @@ def data_cleaner(df, drop_missing=False, remove_duplicates=True):
53
  if remove_duplicates: df = df.drop_duplicates()
54
  return df
55
 
56
- def data_anonymizer(df):
57
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from itertools import combinations
2
+ import numpy as np
3
  import pandas as pd
4
 
5
  SUPPORTED_TYPES = [".csv", ".json", ".xlsx"]
 
55
  if remove_duplicates: df = df.drop_duplicates()
56
  return df
57
 
58
def unique_ratio(df, col):
    """Return the share of non-null values in ``df[col]`` that are distinct.

    Args:
        df: DataFrame containing the column.
        col: Label of the column to inspect.

    Returns:
        ``nunique / count`` for the column, i.e. a value in ``(0, 1]`` when
        the column holds at least one non-null value, and ``0.0`` when the
        column is empty or entirely null.
    """
    non_null = df[col].count()
    if non_null == 0:
        # Nothing to compare: avoid a 0/0 division (NaN or ZeroDivisionError).
        return 0.0
    return df[col].nunique() / non_null
60
+
61
def bin_numeric(df, name_col: str, num_bins: int):
    """Replace a numeric column with ``"low - high"`` range labels.

    The sorted non-null values are split into ``num_bins`` chunks of
    (nearly) equal size; the first value of each chunk, together with the
    column minimum and maximum, become the bin edges. Every non-null value
    is then replaced by the label of the bin it falls in. Bins are half-open
    ``[low, high)`` except the last one, which also contains the maximum.

    Args:
        df: DataFrame to generalise. The column is replaced on this object
            (the frame is modified in place) and the same object is returned.
        name_col: Label of the column to bin.
        num_bins: Requested number of bins. It is capped at the number of
            non-null values, and bins that would be empty because of
            repeated edge values are merged away.

    Returns:
        ``df`` with ``name_col`` holding string labels (null entries are
        left untouched). If ``name_col`` is not a numeric column, or holds
        no non-null values, ``df`` is returned unchanged.

    Raises:
        ValueError: If ``num_bins`` is smaller than 1.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer")

    # Only numeric columns can be binned; anything else (including a column
    # that was already turned into labels) is left as it is.
    if name_col not in df.select_dtypes(include=np.number).columns:
        return df

    column = df[name_col]
    present = column.notna().to_numpy()
    values = column.to_numpy()[present]
    if values.size == 0:
        return df

    ordered = np.sort(values)
    # More bins than values would produce empty chunks with no first element.
    num_bins = min(num_bins, ordered.size)
    chunks = np.array_split(ordered, num_bins)
    pivots = [ordered[0]] + [chunk[0] for chunk in chunks[1:]] + [ordered[-1]]

    # Repeated pivots describe empty bins; keep each edge once.
    edges = np.unique(pivots)
    if edges.size == 1:
        # Constant column: a single degenerate bin "v - v".
        edges = np.repeat(edges, 2)

    labels = np.array(
        [f"{low} - {high}" for low, high in zip(edges[:-1], edges[1:])],
        dtype=object,
    )

    # side="right" maps a value v to the bin with edges[i] <= v < edges[i+1];
    # clipping pulls the column maximum into the last bin instead of
    # leaving it unlabelled.
    positions = np.searchsorted(edges, values, side="right") - 1
    positions = np.clip(positions, 0, labels.size - 1)

    # Build the whole column at once so the dtype change from numbers to
    # strings happens in a single assignment.
    binned = column.to_numpy(dtype=object, copy=True)
    binned[present] = labels[positions]
    df[name_col] = binned
    return df
78
+
79
def get_kanon_false(df, k=2):
    """List the numeric columns that take part in a k-anonymity violation.

    Every combination of ``k`` numeric columns is examined. A combination
    violates k-anonymity when at least one of its value tuples occurs in
    fewer than ``k`` rows; all columns of such a combination are reported.

    Args:
        df: DataFrame to inspect. Only numeric columns are considered.
        k: Both the number of columns combined per check and the minimum
            number of rows that must share each value tuple.

    Returns:
        A list of ``(column, unique_ratio)`` tuples for the offending
        columns, sorted by unique ratio in descending order (ties keep the
        column order of ``df``). The list is empty when nothing needs
        anonymising.
    """
    if k < 1:
        # No group can have fewer than zero rows; nothing can be violated.
        return []

    numeric = df.select_dtypes(include=np.number)
    flagged = set()  # columns appearing in at least one violating combination

    for columns in combinations(numeric.columns, k):
        # Size of each group of rows sharing the same values in `columns`.
        group_sizes = numeric.loc[:, list(columns)].value_counts()
        if (group_sizes < k).any():
            flagged.update(columns)

    # Walk the columns in frame order so that ties sort deterministically.
    ranked = [
        (col, unique_ratio(numeric, col))
        for col in numeric.columns
        if col in flagged
    ]
    return sorted(ranked, key=lambda pair: pair[1], reverse=True)
97
+
98
def k_anonymize(df, k=2, num_bins=15):
    """Bin numeric columns until no k-anonymity violation is reported.

    Columns reported by ``get_kanon_false`` are generalised with
    ``bin_numeric`` and the frame is re-checked, until the check comes back
    empty or no reported column is left to bin.

    Args:
        df: DataFrame to anonymise.
        k: Anonymity parameter forwarded to ``get_kanon_false``.
        num_bins: Number of bins requested from ``bin_numeric`` for each
            column.

    Returns:
        The DataFrame returned by the last ``bin_numeric`` call, or ``df``
        itself when nothing had to be binned.
    """
    already_binned = set()
    k_anon_false = get_kanon_false(df, k)

    while k_anon_false:
        pending = [col for col, _ in k_anon_false if col not in already_binned]
        if not pending:
            # Every reported column has been binned once already; binning
            # again cannot help, so stop instead of looping forever.
            break

        for col in pending:
            print(f"Binning {col}")
            df = bin_numeric(df, col, num_bins=num_bins)
            already_binned.add(col)

        k_anon_false = get_kanon_false(df, k)
        print(f"Updated sensitivity: {k_anon_false}")

    return df
108
+
109
def data_anonymizer(df, k=2):
    """Anonymise ``df`` by delegating to ``k_anonymize``.

    Args:
        df: DataFrame to anonymise.
        k: Anonymity parameter passed through to ``k_anonymize``.

    Returns:
        Whatever ``k_anonymize`` returns for ``df`` and ``k``.
    """
    anonymized = k_anonymize(df, k)
    return anonymized