jfrery-zama committed
Commit 2b591f4
1 Parent(s): b5afc24

add probability along with detected words

Files changed (4)
  1. README copy.md +0 -55
  2. README.md +50 -7
  3. app.py +15 -7
  4. fhe_anonymizer.py +34 -22
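This commit makes the anonymizer return the model's score alongside each detected word, and the Gradio app now shows both in its results table. A minimal sketch of the new output shape, with made-up values:

```python
# Hypothetical example of what FHEAnonymizer.__call__ returns after this commit:
# the anonymized text plus one (word, probability) tuple per flagged word.
anonymized_text = "my name is <REMOVED> and I work at <REMOVED>"
identified_words_with_prob = [("John", 0.91), ("Acme", 0.67)]  # illustrative values only
```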
README copy.md DELETED
@@ -1,55 +0,0 @@
- ---
- title: Encrypted Anonymization Using Fully Homomorphic Encryption
- emoji: 🕵️‍♂️ 🔒
- colorFrom: blue
- colorTo: purple
- sdk: gradio
- sdk_version: 3.40.0
- app_file: app.py
- pinned: true
- tags:
- - FHE
- - PPML
- - privacy
- - privacy preserving machine learning
- - data anonymization
- - homomorphic encryption
- - security
- python_version: 3.10.11
- ---
-
- # Data Anonymization using FHE
-
- ## Run the application locally
-
- ### Install the dependencies
-
- First, create a virtual env and activate it:
-
- ```bash
- python3 -m venv .venv
- source .venv/bin/activate
- ```
-
- Then, install the required packages:
-
- ```python
- pip3 install pip --upgrade
- pip3 install -U pip wheel setuptools --ignore-installed
- pip3 install -r requirements.txt --ignore-installed
- ```
-
- The above steps should only be done once.
-
- ## Run the app
-
- In a terminal, run:
-
- ```bash
- source .venv/bin/activate
- python3 anonymize_app.py
- ```
-
- ## Interact with the application
-
- Open the given URL link (search for a line like `Running on local URL: http://127.0.0.1:8888/`).
README.md CHANGED
@@ -1,12 +1,55 @@
---
- title: Encrypted Anonymization
- emoji: 🐠
- colorFrom: purple
- colorTo: red
+ title: Encrypted Anonymization Using Fully Homomorphic Encryption
+ emoji: 🕵️‍♂️ 🔒
+ colorFrom: blue
+ colorTo: purple
sdk: gradio
- sdk_version: 4.20.0
+ sdk_version: 3.40.0
app_file: app.py
- pinned: false
+ pinned: true
+ tags:
+ - FHE
+ - PPML
+ - privacy
+ - privacy preserving machine learning
+ - data anonymization
+ - homomorphic encryption
+ - security
+ python_version: 3.8.16
---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Data Anonymization using FHE
+
+ ## Run the application locally
+
+ ### Install the dependencies
+
+ First, create a virtual env and activate it:
+
+ ```bash
+ python3 -m venv .venv
+ source .venv/bin/activate
+ ```
+
+ Then, install the required packages:
+
+ ```python
+ pip3 install pip --upgrade
+ pip3 install -U pip wheel setuptools --ignore-installed
+ pip3 install -r requirements.txt --ignore-installed
+ ```
+
+ The above steps should only be done once.
+
+ ## Run the app
+
+ In a terminal, run:
+
+ ```bash
+ source .venv/bin/activate
+ python3 app.py
+ ```
+
+ ## Interact with the application
+
+ Open the given URL link (search for a line like `Running on local URL: http://127.0.0.1:8888/`).
app.py CHANGED
@@ -10,12 +10,15 @@ anonymizer = FHEAnonymizer()


def deidentify_text(input_text):
-     anonymized_text, identified_words = anonymizer(input_text)
-     # Convert the list of identified words into a DataFrame
-     if identified_words:  # Ensure there are identified words to process
-         identified_df = pd.DataFrame(identified_words, columns=["Identified Words"])
+     anonymized_text, identified_words_with_prob = anonymizer(input_text)
+
+     # Convert the list of identified words and probabilities into a DataFrame
+     if identified_words_with_prob:
+         identified_df = pd.DataFrame(
+             identified_words_with_prob, columns=["Identified Words", "Probability"]
+         )
    else:
-         identified_df = pd.DataFrame(columns=["Identified Words"])
+         identified_df = pd.DataFrame(columns=["Identified Words", "Probability"])
    return anonymized_text, identified_df


@@ -76,7 +79,12 @@ with demo:
    )

    with gr.Row():
-         input_text = gr.Textbox(value=default_demo_text, lines=13, placeholder="Input text here...", label="Input")
+         input_text = gr.Textbox(
+             value=default_demo_text,
+             lines=13,
+             placeholder="Input text here...",
+             label="Input",
+         )

        anonymized_text_output = gr.Textbox(label="Anonymized Text", lines=13)

@@ -92,4 +100,4 @@ with demo:


# Launch the app
- demo.launch(share=False)
+ demo.launch(share=False)
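In `deidentify_text`, the `(word, probability)` tuples are converted into a two-column DataFrame so the Gradio table can show the score next to each detected word. A standalone sketch of that conversion, using made-up sample data and assuming only pandas:

```python
import pandas as pd

# Hypothetical output from the anonymizer: one (word, probability) tuple per detected word.
identified_words_with_prob = [("John", 0.91), ("Paris", 0.73)]

if identified_words_with_prob:
    identified_df = pd.DataFrame(
        identified_words_with_prob, columns=["Identified Words", "Probability"]
    )
else:
    # Keep the same columns so the results table renders consistently when nothing is found.
    identified_df = pd.DataFrame(columns=["Identified Words", "Probability"])

print(identified_df)
#   Identified Words  Probability
# 0             John         0.91
# 1            Paris         0.73
```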
fhe_anonymizer.py CHANGED
@@ -6,10 +6,13 @@ from concrete.ml.common.serialization.loaders import load

base_dir = Path(__file__).parent

+
class FHEAnonymizer:
    def __init__(self, punctuation_list=".,!?:;"):

-         self.embeddings_model = gensim.models.FastText.load(str(base_dir / "embedded_model.model"))
+         self.embeddings_model = gensim.models.FastText.load(
+             str(base_dir / "embedded_model.model")
+         )
        self.punctuation_list = punctuation_list
        with open(base_dir / "cml_xgboost.model", "r") as model_file:
            self.fhe_ner_detection = load(file=model_file)
@@ -28,17 +31,19 @@ class FHEAnonymizer:

    def __call__(self, text: str):
        text = self.preprocess_sentences(text)
-         identified_words = []
+         identified_words_with_prob = []  # tuples of (word, probability)
        new_text = []

        for word in text.split():
            # Prediction for each word
            x = self.embeddings_model.wv[word][None]
-             prediction = self.fhe_ner_detection.predict(x)
+             prediction_proba = self.fhe_ner_detection.predict_proba(x)
            # prediction = self.fhe_inference(x).argmax(1)[0]
-
+             # print(word, prediction)
+             probability = prediction_proba[0][1]
+             prediction = probability >= 0.5
            if prediction == 1:
-                 identified_words.append(word)
+                 identified_words_with_prob.append((word, probability))
                new_text.append("<REMOVED>")
            else:
                new_text.append(word)
@@ -46,29 +51,36 @@ class FHEAnonymizer:
        # Joining the modified text
        modified_text = " ".join(new_text)

-         return modified_text, identified_words
+         return modified_text, identified_words_with_prob

    def preprocess_sentences(self, sentence, verbose=False):
        """Preprocess the sentence."""

-         sentence = re.sub(r'\n+', ' ', sentence)
-         if verbose: print(sentence)
+         sentence = re.sub(r"\n+", " ", sentence)
+         if verbose:
+             print(sentence)

-         sentence = re.sub(' +', ' ', sentence)
-         if verbose: print(sentence)
+         sentence = re.sub(" +", " ", sentence)
+         if verbose:
+             print(sentence)

        sentence = re.sub(r"'s\b", " s", sentence)
-         if verbose: print(sentence)
-
-         sentence = re.sub(r'\s([,.!?;:])', r'\1', sentence)
-         if verbose: print(sentence)
-
-         pattern = r'(?<!\w)[{}]|[{}](?!\w)'.format(re.escape(self.punctuation_list), re.escape(self.punctuation_list))
-         sentence = re.sub(pattern, '', sentence)
-         if verbose: print(sentence)
-
-         sentence = re.sub(r'\s([,.!?;:])', r'\1', sentence)
-         if verbose: print(sentence)
-
+         if verbose:
+             print(sentence)
+
+         sentence = re.sub(r"\s([,.!?;:])", r"\1", sentence)
+         if verbose:
+             print(sentence)
+
+         pattern = r"(?<!\w)[{}]|[{}](?!\w)".format(
+             re.escape(self.punctuation_list), re.escape(self.punctuation_list)
+         )
+         sentence = re.sub(pattern, "", sentence)
+         if verbose:
+             print(sentence)
+
+         sentence = re.sub(r"\s([,.!?;:])", r"\1", sentence)
+         if verbose:
+             print(sentence)

        return sentence
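The core change in `FHEAnonymizer.__call__` is the switch from `predict` to `predict_proba`: the positive-class probability is thresholded at 0.5 to decide whether a word is removed, and the probability is kept for display. A minimal sketch of that per-word loop with a stand-in scorer (the real app scores FastText word embeddings with the Concrete ML XGBoost model):

```python
from typing import Callable, List, Tuple

def anonymize(
    text: str,
    score_word: Callable[[str], float],  # stand-in for predict_proba on a word embedding
    threshold: float = 0.5,
) -> Tuple[str, List[Tuple[str, float]]]:
    identified_words_with_prob = []  # tuples of (word, probability)
    new_text = []
    for word in text.split():
        probability = score_word(word)
        if probability >= threshold:
            # Flagged as sensitive: record the score and mask the word.
            identified_words_with_prob.append((word, probability))
            new_text.append("<REMOVED>")
        else:
            new_text.append(word)
    return " ".join(new_text), identified_words_with_prob

# Toy scorer: pretend capitalized words are entities with probability 0.9.
demo_scorer = lambda w: 0.9 if w[:1].isupper() else 0.1
print(anonymize("my name is Alice", demo_scorer))
# ('my name is <REMOVED>', [('Alice', 0.9)])
```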