jfrery-zama committed on
Commit
d0b1031
·
1 Parent(s): 1dfccc3

update anonymize file in clear with roberta + update uuid map with query id

Browse files
Files changed (3) hide show
  1. anonymize_file_clear.py +13 -7
  2. app.py +1 -1
  3. fhe_anonymizer.py +5 -5
anonymize_file_clear.py CHANGED
@@ -5,15 +5,21 @@ import uuid
5
  from pathlib import Path
6
  import gensim
7
  from concrete.ml.common.serialization.loaders import load
 
 
8
 
9
  def load_models():
10
  base_dir = Path(__file__).parent / "models"
11
- embeddings_model = gensim.models.FastText.load(str(base_dir / "without_pronoun_embedded_model.model"))
12
- with open(base_dir / "without_pronoun_cml_xgboost.model", "r") as model_file:
 
 
 
 
13
  fhe_ner_detection = load(file=model_file)
14
- return embeddings_model, fhe_ner_detection
15
 
16
- def anonymize_text(text, embeddings_model, fhe_ner_detection):
17
  token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
18
  tokens = re.findall(token_pattern, text)
19
  uuid_map = {}
@@ -21,7 +27,7 @@ def anonymize_text(text, embeddings_model, fhe_ner_detection):
21
 
22
  for token in tokens:
23
  if token.strip() and re.match(r"\w+", token): # If the token is a word
24
- x = embeddings_model.wv[token][None]
25
  prediction_proba = fhe_ner_detection.predict_proba(x)
26
  probability = prediction_proba[0][1]
27
  prediction = probability >= 0.5
@@ -42,7 +48,7 @@ def main():
42
  parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
43
  args = parser.parse_args()
44
 
45
- embeddings_model, fhe_ner_detection = load_models()
46
 
47
  # Read the input file
48
  with open(args.file_path, 'r', encoding='utf-8') as file:
@@ -54,7 +60,7 @@ def main():
54
  original_file.write(text)
55
 
56
  # Anonymize the text
57
- anonymized_text, uuid_map = anonymize_text(text, embeddings_model, fhe_ner_detection)
58
 
59
  # Save the anonymized text to its specified file
60
  anonymized_file_path = Path(__file__).parent / "files" / "anonymized_document.txt"
 
5
  from pathlib import Path
6
  import gensim
7
  from concrete.ml.common.serialization.loaders import load
8
+ from transformers import AutoTokenizer, AutoModel
9
+ from utils_demo import get_batch_text_representation
10
 
11
  def load_models():
12
  base_dir = Path(__file__).parent / "models"
13
+
14
+ # Load tokenizer and model
15
+ tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
16
+ embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
17
+
18
+ with open(base_dir / "cml_logreg.model", "r") as model_file:
19
  fhe_ner_detection = load(file=model_file)
20
+ return embeddings_model, tokenizer, fhe_ner_detection
21
 
22
+ def anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection):
23
  token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
24
  tokens = re.findall(token_pattern, text)
25
  uuid_map = {}
 
27
 
28
  for token in tokens:
29
  if token.strip() and re.match(r"\w+", token): # If the token is a word
30
+ x = get_batch_text_representation([token], embeddings_model, tokenizer)
31
  prediction_proba = fhe_ner_detection.predict_proba(x)
32
  probability = prediction_proba[0][1]
33
  prediction = probability >= 0.5
 
48
  parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
49
  args = parser.parse_args()
50
 
51
+ embeddings_model, tokenizer, fhe_ner_detection = load_models()
52
 
53
  # Read the input file
54
  with open(args.file_path, 'r', encoding='utf-8') as file:
 
60
  original_file.write(text)
61
 
62
  # Anonymize the text
63
+ anonymized_text, uuid_map = anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection)
64
 
65
  # Save the anonymized text to its specified file
66
  anonymized_file_path = Path(__file__).parent / "files" / "anonymized_document.txt"
app.py CHANGED
@@ -142,7 +142,7 @@ with demo:
142
 
143
  examples_radio.change(lambda example_query: example_query, inputs=[examples_radio], outputs=[input_text])
144
 
145
- anonymized_text_output = gr.Textbox(label="Anonymized Text with FHE", lines=1)
146
 
147
  identified_words_output = gr.Dataframe(label="Identified Words", visible=False)
148
 
 
142
 
143
  examples_radio.change(lambda example_query: example_query, inputs=[examples_radio], outputs=[input_text])
144
 
145
+ anonymized_text_output = gr.Textbox(label="Anonymized Text with FHE", lines=1, interactive=True)
146
 
147
  identified_words_output = gr.Dataframe(label="Identified Words", visible=False)
148
 
fhe_anonymizer.py CHANGED
@@ -14,13 +14,11 @@ base_dir = Path(__file__).parent
14
  class FHEAnonymizer:
15
  def __init__(self, punctuation_list=".,!?:;"):
16
 
17
- # Load tokenizer and model, move model to the selected device
18
  self.tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
19
  self.embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
20
 
21
  self.punctuation_list = punctuation_list
22
- with open(base_dir / "models/without_pronoun_cml_xgboost.model", "r") as model_file:
23
- self.fhe_ner_detection = load(file=model_file)
24
 
25
  with open(base_dir / "original_document_uuid_mapping.json", 'r') as file:
26
  self.uuid_map = json.load(file)
@@ -44,7 +42,6 @@ class FHEAnonymizer:
44
  identified_words_with_prob = []
45
  processed_tokens = []
46
 
47
- print(tokens)
48
  for token in tokens:
49
  # Directly append non-word tokens or whitespace to processed_tokens
50
  if not token.strip() or not re.match(r"\w+", token):
@@ -54,7 +51,6 @@ class FHEAnonymizer:
54
  # Prediction for each word
55
  x = get_batch_text_representation([token], self.embeddings_model, self.tokenizer)
56
 
57
- # prediction_proba = self.fhe_ner_detection.predict_proba(x)
58
  prediction_proba = self.fhe_inference(x)
59
  probability = prediction_proba[0][1]
60
 
@@ -68,6 +64,10 @@ class FHEAnonymizer:
68
  else:
69
  processed_tokens.append(token)
70
 
 
 
 
 
71
  # Reconstruct the sentence
72
  reconstructed_sentence = ''.join(processed_tokens)
73
  return reconstructed_sentence, identified_words_with_prob
 
14
  class FHEAnonymizer:
15
  def __init__(self, punctuation_list=".,!?:;"):
16
 
17
+ # Load tokenizer and model
18
  self.tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
19
  self.embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
20
 
21
  self.punctuation_list = punctuation_list
 
 
22
 
23
  with open(base_dir / "original_document_uuid_mapping.json", 'r') as file:
24
  self.uuid_map = json.load(file)
 
42
  identified_words_with_prob = []
43
  processed_tokens = []
44
 
 
45
  for token in tokens:
46
  # Directly append non-word tokens or whitespace to processed_tokens
47
  if not token.strip() or not re.match(r"\w+", token):
 
51
  # Prediction for each word
52
  x = get_batch_text_representation([token], self.embeddings_model, self.tokenizer)
53
 
 
54
  prediction_proba = self.fhe_inference(x)
55
  probability = prediction_proba[0][1]
56
 
 
64
  else:
65
  processed_tokens.append(token)
66
 
67
+ # Update the UUID map with query.
68
+ with open(base_dir / "original_document_uuid_mapping.json", 'w') as file:
69
+ json.dump(self.uuid_map, file)
70
+
71
  # Reconstruct the sentence
72
  reconstructed_sentence = ''.join(processed_tokens)
73
  return reconstructed_sentence, identified_words_with_prob