jfrery-zama committed on
Commit
628fe8f
·
1 Parent(s): be67bc2

use without pronoun model

Browse files
anonymize_file_clear.py CHANGED
@@ -7,9 +7,9 @@ import gensim
7
  from concrete.ml.common.serialization.loaders import load
8
 
9
  def load_models():
10
- base_dir = Path(__file__).parent
11
- embeddings_model = gensim.models.FastText.load(str(base_dir / "embedded_model.model"))
12
- with open(base_dir / "cml_xgboost.model", "r") as model_file:
13
  fhe_ner_detection = load(file=model_file)
14
  return embeddings_model, fhe_ner_detection
15
 
@@ -34,7 +34,8 @@ def anonymize_text(text, embeddings_model, fhe_ner_detection):
34
  else:
35
  processed_tokens.append(token) # Preserve punctuation and spaces as is
36
 
37
- return uuid_map
 
38
 
39
  def main():
40
  parser = argparse.ArgumentParser(description="Anonymize named entities in a text file and save the mapping to a JSON file.")
@@ -47,14 +48,26 @@ def main():
47
  with open(args.file_path, 'r', encoding='utf-8') as file:
48
  text = file.read()
49
 
 
 
 
 
 
50
  # Anonymize the text
51
- uuid_map = anonymize_text(text, embeddings_model, fhe_ner_detection)
 
 
 
 
 
52
 
53
  # Save the UUID mapping to a JSON file
54
  mapping_path = Path(args.file_path).stem + "_uuid_mapping.json"
55
  with open(mapping_path, 'w', encoding='utf-8') as file:
56
  json.dump(uuid_map, file, indent=4, sort_keys=True)
57
 
 
 
58
  print(f"UUID mapping saved to {mapping_path}")
59
 
60
  if __name__ == "__main__":
 
7
  from concrete.ml.common.serialization.loaders import load
8
 
9
  def load_models():
10
+ base_dir = Path(__file__).parent / "models"
11
+ embeddings_model = gensim.models.FastText.load(str(base_dir / "without_pronoun_embedded_model.model"))
12
+ with open(base_dir / "without_pronoun_cml_xgboost.model", "r") as model_file:
13
  fhe_ner_detection = load(file=model_file)
14
  return embeddings_model, fhe_ner_detection
15
 
 
34
  else:
35
  processed_tokens.append(token) # Preserve punctuation and spaces as is
36
 
37
+ anonymized_text = ''.join(processed_tokens)
38
+ return anonymized_text, uuid_map
39
 
40
  def main():
41
  parser = argparse.ArgumentParser(description="Anonymize named entities in a text file and save the mapping to a JSON file.")
 
48
  with open(args.file_path, 'r', encoding='utf-8') as file:
49
  text = file.read()
50
 
51
+ # Save the original text to its specified file
52
+ original_file_path = Path(__file__).parent / "files" / "original_document.txt"
53
+ with open(original_file_path, 'w', encoding='utf-8') as original_file:
54
+ original_file.write(text)
55
+
56
  # Anonymize the text
57
+ anonymized_text, uuid_map = anonymize_text(text, embeddings_model, fhe_ner_detection)
58
+
59
+ # Save the anonymized text to its specified file
60
+ anonymized_file_path = Path(__file__).parent / "files" / "anonymized_document.txt"
61
+ with open(anonymized_file_path, 'w', encoding='utf-8') as anonymized_file:
62
+ anonymized_file.write(anonymized_text)
63
 
64
  # Save the UUID mapping to a JSON file
65
  mapping_path = Path(args.file_path).stem + "_uuid_mapping.json"
66
  with open(mapping_path, 'w', encoding='utf-8') as file:
67
  json.dump(uuid_map, file, indent=4, sort_keys=True)
68
 
69
+ print(f"Original text saved to {original_file_path}")
70
+ print(f"Anonymized text saved to {anonymized_file_path}")
71
  print(f"UUID mapping saved to {mapping_path}")
72
 
73
  if __name__ == "__main__":
app.py CHANGED
@@ -59,13 +59,13 @@ def query_chatgpt(anonymized_query):
59
  tokens = re.findall(token_pattern, anonymized_response)
60
  processed_tokens = []
61
 
62
- print(tokens)
63
  for token in tokens:
64
  # Directly append non-word tokens or whitespace to processed_tokens
65
  if not token.strip() or not re.match(r"\w+", token):
66
  processed_tokens.append(token)
67
  continue
68
- print(token)
69
  if token in inverse_uuid_map:
70
  processed_tokens.append(inverse_uuid_map[token])
71
  else:
@@ -136,7 +136,7 @@ with demo:
136
 
137
  anonymized_text_output = gr.Textbox(label="Anonymized Text with FHE", lines=13)
138
 
139
- identified_words_output = gr.Dataframe(label="Identified Words", visible=False)
140
 
141
  submit_button = gr.Button("Anonymize with FHE")
142
 
 
59
  tokens = re.findall(token_pattern, anonymized_response)
60
  processed_tokens = []
61
 
62
+
63
  for token in tokens:
64
  # Directly append non-word tokens or whitespace to processed_tokens
65
  if not token.strip() or not re.match(r"\w+", token):
66
  processed_tokens.append(token)
67
  continue
68
+
69
  if token in inverse_uuid_map:
70
  processed_tokens.append(inverse_uuid_map[token])
71
  else:
 
136
 
137
  anonymized_text_output = gr.Textbox(label="Anonymized Text with FHE", lines=13)
138
 
139
+ identified_words_output = gr.Dataframe(label="Identified Words", visible=True)
140
 
141
  submit_button = gr.Button("Anonymize with FHE")
142
 
demo_text.txt CHANGED
@@ -1 +1 @@
1
- Who lives in Maine?
 
1
+ who lives in Maine?
deployment/client.zip CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ef07d7867a023a0d3aeab3d029b5ab4941786841fee38ce993b38af00392142
3
- size 253880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e4798bf93e38a14f5f1aa15203bb093cf15c4dfee7edbd8e0f7767605755090
3
+ size 129876
deployment/server.zip CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c446388b3e562c5babd8f98d00229e78d5c46952764b2bf577ca70bd5d96cb0
3
- size 24234
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86fc185025bf0c84d50aef44f14308c0077db2701a9a835bf4e3b912a58cb9b0
3
+ size 5639
fhe_anonymizer.py CHANGED
@@ -13,10 +13,10 @@ class FHEAnonymizer:
13
  def __init__(self, punctuation_list=".,!?:;"):
14
 
15
  self.embeddings_model = gensim.models.FastText.load(
16
- str(base_dir / "embedded_model.model")
17
  )
18
  self.punctuation_list = punctuation_list
19
- with open(base_dir / "cml_xgboost.model", "r") as model_file:
20
  self.fhe_ner_detection = load(file=model_file)
21
 
22
  with open(base_dir / "original_document_uuid_mapping.json", 'r') as file:
@@ -50,7 +50,8 @@ class FHEAnonymizer:
50
 
51
  # Prediction for each word
52
  x = self.embeddings_model.wv[token][None]
53
- prediction_proba = self.fhe_ner_detection.predict_proba(x)
 
54
  probability = prediction_proba[0][1]
55
 
56
  if probability >= 0.5:
 
13
  def __init__(self, punctuation_list=".,!?:;"):
14
 
15
  self.embeddings_model = gensim.models.FastText.load(
16
+ str(base_dir / "models/without_pronoun_embedded_model.model")
17
  )
18
  self.punctuation_list = punctuation_list
19
+ with open(base_dir / "models/without_pronoun_cml_xgboost.model", "r") as model_file:
20
  self.fhe_ner_detection = load(file=model_file)
21
 
22
  with open(base_dir / "original_document_uuid_mapping.json", 'r') as file:
 
50
 
51
  # Prediction for each word
52
  x = self.embeddings_model.wv[token][None]
53
+ # prediction_proba = self.fhe_ner_detection.predict_proba(x)
54
+ prediction_proba = self.fhe_inference(x)
55
  probability = prediction_proba[0][1]
56
 
57
  if probability >= 0.5:
files/anonymized_document.txt CHANGED
@@ -1,10 +1,10 @@
1
- Hello, 0a182475 name is 84e24eb8 0ea0e35c and 9f2963af live in 5d0a593d.
2
- 6d7a71e0 credit card number is 1155d2a5 and 0a182475 d8e7c7fc c627b841 id is be0eddce.
3
 
4
- On b926f540 ee1fa38f 9f2963af visited 1e63e774 and sent an email to f70cf334, from the IP 2d66aefe.
5
 
6
- 6d7a71e0 passport: 53b52085 and 0a182475 phone number: 81133b16 08aeb9f5.
7
 
8
- 3bbcaf02 is a valid 11e27c41 5a60bcfa 3730179e 97008100: ab18e6bb . Can 773b08cb f4953628 check the 8ff83dc3 on bank account e1a23ada?
9
 
10
- 7ab7335c's social security number is 11e90148. 67e5e61e driver license? it is c6652117.
 
1
+ Hello, my name is 97dc4202 7ce27ecb and I live in aaf4b006.
2
+ My credit card number is bfd59a59 and my c7184a17 516361a1 f8380bf5 is edf660df.
3
 
4
+ On d615d819 3f343449 I visited b6394fb9 and 732237ac an email to a295c5d0, from the c591bc5d c83dd929.
5
 
6
+ My passport: c263d176 and my c402f998 number: a054c8c2 8fddc160.
7
 
8
+ This is a 08876c6f c5462fed 49b9cffb 3658044b Number: 2f075e1d . Can you please e51c8e1c the f1b9c36f on 4fd4e4c4 aa960526 148fea84?
9
 
10
+ a18f3dda's bcda6774 security number is 48f7c8a4. Her driver license? it is ab7ec0c3.
files/question_demo.txt DELETED
@@ -1,10 +0,0 @@
1
- Strategic Focus: What are the primary areas of focus in the strategic development plan for the technology firm, and why were they chosen?
2
- Revenue Growth: How does the company plan to achieve a 20% increase in revenue through expansion into emerging markets?
3
- Innovation Investment: What specific types of AI algorithms and cloud solutions is the company planning to develop with the allocated $100 million R&D investment?
4
- Partnerships and Acquisitions Strategy: Can you explain the criteria used for selecting startups for partnerships and acquisitions in the AI and IoT sectors?
5
- Risk Management: What are the key risks identified for the company, and what strategies are in place to mitigate these risks?
6
- Financial Projections: Based on the strategic initiatives outlined, what are the projected financial outcomes for the company over the next three years?
7
- Market Competition: How does the company plan to continuously analyze and adapt to competitive strategies in the technology sector?
8
- Regulatory Compliance: What measures will the company take to ensure adherence to global regulations, especially when expanding into new markets?
9
- Product Portfolio: How will the strategic partnerships and acquisitions enhance the company's product portfolio?
10
- Sustainability: What steps is the company taking to ensure the sustainability of its growth in the face of technological changes and market competition?
 
 
 
 
 
 
 
 
 
 
 
cml_xgboost.model → models/cml_xgboost.model RENAMED
File without changes
embedded_model.model → models/embedded_model.model RENAMED
File without changes
embedded_model.model.wv.vectors_ngrams.npy → models/embedded_model.model.wv.vectors_ngrams.npy RENAMED
File without changes
models/without_pronoun_cml_xgboost.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:933d1d5c5f83c30211dd9a497482c517a822df809c0498fed164de72bd7bf910
3
+ size 1085795
models/without_pronoun_embedded_model.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:762240ca4040c68e44c403f16abce5683a0c4a005ec10f3dd0135a0e429a66c1
3
+ size 1189196
models/without_pronoun_embedded_model.model.wv.vectors_ngrams.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cf06fe78185b373c97ee0616f599ce6b1aceb6445b8f666fac6cd4cd307fe46
3
+ size 400000128
original_document_uuid_mapping.json CHANGED
@@ -1,34 +1,35 @@
1
  {
2
- "078-05-1126": "11e90148",
3
- "1234567A": "c6652117",
4
- "16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ": "be0eddce",
5
- "18": "ee1fa38f",
6
- "191280342": "53b52085",
7
- "192.168.0.1": "2d66aefe",
8
- "212": "81133b16",
9
- "4095-2609-9393-4932": "1155d2a5",
10
- "555-1234": "08aeb9f5",
11
- "954567876544": "e1a23ada",
12
- "Account": "3730179e",
13
- "Bank": "5a60bcfa",
14
- "David": "84e24eb8",
15
- "Her": "67e5e61e",
16
- "I": "9f2963af",
17
- "IL150120690000003111111": "ab18e6bb",
18
- "International": "11e27c41",
19
- "Johnson": "0ea0e35c",
20
- "Kate": "7ab7335c",
21
- "Maine": "5d0a593d",
22
- "My": "6d7a71e0",
23
- "Number": "97008100",
24
- "September": "b926f540",
25
- "This": "3bbcaf02",
26
- "crypto": "d8e7c7fc",
27
- "microsoft.com": "1e63e774",
28
- "my": "0a182475",
29
- "please": "f4953628",
30
- "status": "8ff83dc3",
31
- "test@presidio.site": "f70cf334",
32
- "wallet": "c627b841",
33
- "you": "773b08cb"
 
34
  }
 
1
  {
2
+ "078-05-1126": "48f7c8a4",
3
+ "1234567A": "ab7ec0c3",
4
+ "16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ": "edf660df",
5
+ "18": "3f343449",
6
+ "191280342": "c263d176",
7
+ "192.168.0.1": "c83dd929",
8
+ "212": "a054c8c2",
9
+ "4095-2609-9393-4932": "bfd59a59",
10
+ "555-1234": "8fddc160",
11
+ "954567876544": "148fea84",
12
+ "Account": "3658044b",
13
+ "Bank": "49b9cffb",
14
+ "David": "97dc4202",
15
+ "IL150120690000003111111": "2f075e1d",
16
+ "IP": "c591bc5d",
17
+ "International": "c5462fed",
18
+ "Johnson": "7ce27ecb",
19
+ "Kate": "a18f3dda",
20
+ "Maine": "aaf4b006",
21
+ "September": "d615d819",
22
+ "account": "aa960526",
23
+ "bank": "4fd4e4c4",
24
+ "check": "e51c8e1c",
25
+ "crypto": "c7184a17",
26
+ "id": "f8380bf5",
27
+ "microsoft.com": "b6394fb9",
28
+ "phone": "c402f998",
29
+ "sent": "732237ac",
30
+ "social": "bcda6774",
31
+ "status": "f1b9c36f",
32
+ "test@presidio.site": "a295c5d0",
33
+ "valid": "08876c6f",
34
+ "wallet": "516361a1"
35
  }
utils_demo.py CHANGED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
import uuid

# A token is treated as a "word" when it starts with at least one word
# character; everything else (punctuation, whitespace) is copied through.
_WORD_PATTERN = re.compile(r"\w+")


def process_tokens(tokens, inverse_uuid_map=None, uuid_map=None, embeddings_model=None, fhe_ner_detection=None, client=None):
    """Rewrite a token sequence and return the tokens joined into one string.

    The processing mode is selected by which arguments are supplied:

    * ``inverse_uuid_map`` given: de-anonymize. Each word token is replaced by
      ``inverse_uuid_map[token]`` when present, otherwise kept unchanged.
    * ``uuid_map``, ``embeddings_model``, ``fhe_ner_detection`` and ``client``
      all given: anonymize. Each word token is embedded via
      ``embeddings_model.wv[token][None]`` and scored with
      ``fhe_ner_detection.predict_proba``; when the value at ``[0][1]`` is
      at least 0.5 the token is replaced by an 8-character identifier.
    * otherwise: tokens are passed through untouched.

    Whitespace-only tokens and tokens that do not start with a word character
    are always copied as is.

    Args:
        tokens: Iterable of string tokens (words, punctuation, whitespace).
        inverse_uuid_map: Optional mapping from identifier to original word.
        uuid_map: Optional mapping from word to identifier. It is mutated in
            place: newly flagged words are added to it.
        embeddings_model: Optional object exposing ``wv[token]``.
        fhe_ner_detection: Optional object exposing ``predict_proba(x)``.
        client: Must be non-None to enable the anonymizing mode; it is not
            otherwise used by this function.

    Returns:
        The processed tokens concatenated with no separator.
    """
    deanonymize = inverse_uuid_map is not None
    anonymize = (
        uuid_map is not None
        and embeddings_model is not None
        and fhe_ner_detection is not None
        and client is not None
    )

    processed_tokens = []
    for token in tokens:
        # Non-word tokens keep the original spacing and punctuation intact.
        if not token.strip() or not _WORD_PATTERN.match(token):
            processed_tokens.append(token)
            continue

        if deanonymize:
            # De-anonymizing takes precedence when both maps are supplied.
            processed_tokens.append(inverse_uuid_map.get(token, token))
        elif anonymize:
            x = embeddings_model.wv[token][None]
            prediction_proba = fhe_ner_detection.predict_proba(x)
            # presumably index [0][1] is the "sensitive word" class
            # probability -- confirm against the trained model.
            probability = prediction_proba[0][1]
            if probability >= 0.5:
                # Reuse the identifier already assigned to this word so that
                # repeated occurrences map to the same replacement.
                if token not in uuid_map:
                    uuid_map[token] = str(uuid.uuid4())[:8]
                processed_tokens.append(uuid_map[token])
            else:
                processed_tokens.append(token)
        else:
            processed_tokens.append(token)
    return ''.join(processed_tokens)