kcelia committed on
Commit
ce217e0
1 Parent(s): bc93019

chore: update Marketing v2

Browse files
app.py CHANGED
@@ -35,6 +35,7 @@ ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
35
  MAPPING_ANONYMIZED_SENTENCES = read_pickle(MAPPING_ANONYMIZED_SENTENCES_PATH)
36
  MAPPING_ENCRYPTED_SENTENCES = read_pickle(MAPPING_ENCRYPTED_SENTENCES_PATH)
37
  ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
 
38
  print(ORIGINAL_DOCUMENT)
39
 
40
  # 4. Data Processing and Operations (No specific operations shown here, assuming it's part of anonymizer or client usage)
@@ -54,7 +55,7 @@ def select_static_anonymized_sentences_fn(selected_sentences: List):
54
 
55
  anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence]
56
 
57
- return {anonymized_doc_box: gr.update(value="\n\n".join(anonymized_selected_sentence))}
58
 
59
 
60
  def key_gen_fn() -> Dict:
@@ -92,23 +93,48 @@ def key_gen_fn() -> Dict:
92
  print("Keys have been generated ✅")
93
  return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
94
 
95
- def select_static_encrypted_sentences_fn(selected_sentences: List):
96
 
97
- selected_sentences = [MAPPING_ENCRYPTED_SENTENCES[sentence] for sentence in selected_sentences]
98
 
99
- anonymized_selected_sentence = sorted(selected_sentences, key=lambda x: x[0])
100
 
101
- anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence]
 
 
 
 
 
102
 
103
- return {encrypted_doc_box: gr.update(value="\n\n".join(anonymized_selected_sentence))}
 
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  def encrypt_query_fn(query):
107
 
108
  print(f"\n------------ Step 2: Query encryption: {query=}")
109
 
110
  if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
111
- return {output_encrypted_box: gr.update(value="Error ❌: Please generate the key first!")}
112
 
113
  if is_user_query_valid(query):
114
  return {
@@ -156,8 +182,8 @@ def encrypt_query_fn(query):
156
  encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
157
 
158
  return {
159
- output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=4),
160
- anonymized_text_output: gr.update(visible=True, value=None),
161
  identified_words_output_df: gr.update(visible=False, value=None),
162
  }
163
 
@@ -176,14 +202,14 @@ def send_input_fn(query) -> Dict:
176
  "Error Encountered While Sending Data to the Server: "
177
  f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
178
  )
179
- return {anonymized_text_output: gr.update(value=error_message)}
180
 
181
  if not encrypted_input_path.is_file():
182
  error_message = (
183
  "Error Encountered While Sending Data to the Server: The data has not been encrypted "
184
  f"correctly on the client side - {encrypted_input_path.is_file()=}"
185
  )
186
- return {anonymized_text_output: gr.update(value=error_message)}
187
 
188
  # Define the data and files to post
189
  data = {"user_id": USER_ID, "input": query}
@@ -218,14 +244,14 @@ def run_fhe_in_server_fn() -> Dict:
218
  "Error Encountered While Sending Data to the Server: "
219
  f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
220
  )
221
- return {anonymized_text_output: gr.update(value=error_message)}
222
 
223
  if not encrypted_input_path.is_file():
224
  error_message = (
225
  "Error Encountered While Sending Data to the Server: The data has not been encrypted "
226
  f"correctly on the client side - {encrypted_input_path.is_file()=}"
227
  )
228
- return {anonymized_text_output: gr.update(value=error_message)}
229
 
230
  data = {
231
  "user_id": USER_ID,
@@ -239,7 +265,7 @@ def run_fhe_in_server_fn() -> Dict:
239
  ) as response:
240
  if not response.ok:
241
  return {
242
- anonymized_text_output: gr.update(
243
  value=(
244
  "⚠️ An error occurred on the Server Side. "
245
  "Please check connectivity and data transmission."
@@ -260,14 +286,14 @@ def get_output_fn() -> Dict:
260
  "Error Encountered While Sending Data to the Server: "
261
  "The key has not been generated correctly"
262
  )
263
- return {anonymized_text_output: gr.update(value=error_message)}
264
 
265
  if not (KEYS_DIR / f"{USER_ID}/encrypted_input").is_file():
266
  error_message = (
267
  "Error Encountered While Sending Data to the Server: "
268
  "The data has not been encrypted correctly on the client side"
269
  )
270
- return {anonymized_text_output: gr.update(value=error_message)}
271
 
272
  data = {
273
  "user_id": USER_ID,
@@ -372,7 +398,7 @@ def decrypt_fn(text) -> Dict:
372
  return anonymized_text, identified_df
373
 
374
 
375
- def anonymization_with_fn(query):
376
 
377
  encrypt_query_fn(query)
378
 
@@ -385,8 +411,9 @@ def anonymization_with_fn(query):
385
  anonymized_text, identified_df = decrypt_fn(query)
386
 
387
  return {
388
- anonymized_text_output: gr.update(value=anonymized_text),
389
- identified_words_output_df: gr.update(value=identified_df, visible=True),
 
390
  }
391
 
392
 
@@ -402,10 +429,9 @@ def query_chatgpt_fn(anonymized_query, anonymized_document):
402
  error_message = "Error ❌: Please encrypt your query first!"
403
  return {chatgpt_response_anonymized: gr.update(value=error_message)}
404
 
405
- prompt = read_txt(PROMPT_PATH)
406
 
407
  # Prepare prompt
408
- initial_prompt = prompt + "\n"
409
  query = (
410
  "Document content:\n```\n"
411
  + anonymized_document
@@ -414,12 +440,12 @@ def query_chatgpt_fn(anonymized_query, anonymized_document):
414
  + anonymized_query
415
  + "\n```"
416
  )
417
- print(f'initial_prompt:\n{initial_prompt}')
418
 
419
  completion = client.chat.completions.create(
420
  model="gpt-4-1106-preview", # Replace with "gpt-4" if available
421
  messages=[
422
- {"role": "system", "content": prompt},
423
  {"role": "user", "content": query},
424
  ],
425
  )
@@ -472,26 +498,31 @@ with demo:
472
  """
473
  )
474
 
475
- # gr.Markdown(
476
- # """
477
- # <p align="center">
478
- # <img width="15%" height="15%" src="./encrypted_anonymization_diagram.jpg">
479
- # </p>
480
- # """
481
- # )
482
-
483
- with gr.Accordion("What is encrypted anonymization?", open=False):
484
- gr.Markdown(
485
- """Anonymization is the process of removing personally identifiable information (PII) data
486
- from a document in order to protect individual privacy.
487
-
488
- Encrypted anonymization using Fully Homomorphic Encryption (FHE) solves issues when
489
- deploying such tool through an untrusted cloud service, as Fully Homomorphic Encryption
490
- (FHE) allows such services to anonymize personally identifiable information (PII) on an
491
- encrypted document. Once the data is anonymized, it can safely be sent to LLM services such
492
- as ChatGPT.
493
  """
494
- )
 
 
 
 
 
495
 
496
  ########################## Key Gen Part ##########################
497
 
@@ -535,16 +566,10 @@ with demo:
535
  encrypt_doc_btn = gr.Button("Encrypt the document")
536
 
537
  with gr.Column(scale=5):
538
- anonymized_doc_box = gr.Textbox(
539
- label="Encrypted document:",
540
- show_label=True, value=ANONYMIZED_DOCUMENT, interactive=False, lines=11
541
  )
542
 
543
- original_sentences_box.change(
544
- fn=select_static_anonymized_sentences_fn,
545
- inputs=[original_sentences_box],
546
- outputs=[anonymized_doc_box],
547
- )
548
 
549
  ########################## User Query Part ##########################
550
 
@@ -577,7 +602,7 @@ with demo:
577
 
578
  with gr.Column(scale=1, min_width=6):
579
  gr.HTML("<div style='height: 77px;'></div>")
580
- encrypt_btn = gr.Button("Encrypt the prompt")
581
  # gr.HTML("<div style='height: 50px;'></div>")
582
 
583
  with gr.Column(scale=5):
@@ -602,34 +627,40 @@ with demo:
602
  with gr.Row():
603
  with gr.Column(scale=5):
604
 
605
- anonymized_text_output = gr.Textbox(
606
- label="Decrypted and anonymized document", lines=5, interactive=True
607
  )
608
 
609
  with gr.Column(scale=5):
610
 
611
  anonymized_query_output = gr.Textbox(
612
- label="Decrypted and anonymized prompt", lines=5, interactive=True
613
  )
614
 
615
 
616
  identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
617
 
618
- encrypt_btn.click(
 
 
 
 
 
 
619
  fn=encrypt_query_fn,
620
  inputs=[query_box],
621
  outputs=[
622
  query_box,
623
  output_encrypted_box,
624
- anonymized_text_output,
625
  identified_words_output_df,
626
  ],
627
  )
628
 
629
  run_fhe_btn.click(
630
  anonymization_with_fn,
631
- inputs=[query_box],
632
- outputs=[anonymized_text_output, identified_words_output_df],
633
  )
634
 
635
  ########################## ChatGpt Part ##########################
@@ -651,7 +682,7 @@ with demo:
651
 
652
  chatgpt_button.click(
653
  query_chatgpt_fn,
654
- inputs=[anonymized_text_output, anonymized_doc_box],
655
  outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
656
  )
657
 
 
35
  MAPPING_ANONYMIZED_SENTENCES = read_pickle(MAPPING_ANONYMIZED_SENTENCES_PATH)
36
  MAPPING_ENCRYPTED_SENTENCES = read_pickle(MAPPING_ENCRYPTED_SENTENCES_PATH)
37
  ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
38
+ MAPPING_DOC_EMBEDDING = read_pickle(MAPPING_DOC_EMBEDDING_PATH)
39
  print(ORIGINAL_DOCUMENT)
40
 
41
  # 4. Data Processing and Operations (No specific operations shown here, assuming it's part of anonymizer or client usage)
 
55
 
56
  anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence]
57
 
58
+ return "\n\n".join(anonymized_selected_sentence)
59
 
60
 
61
  def key_gen_fn() -> Dict:
 
93
  print("Keys have been generated ✅")
94
  return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
95
 
 
96
 
97
+ def encrypt_doc_fn(doc):
98
 
99
+ print(f"\n------------ Step 2.1: Doc encryption: {doc=}")
100
 
101
+ if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
102
+ return {encrypted_doc_box: gr.update(value="Error ❌: Please generate the key first!", lines=10)}
103
+
104
+ # Retrieve the client API
105
+ client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
106
+ client.load()
107
 
108
+ encrypted_tokens = []
109
+ tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", ' '.join(doc))
110
 
111
+ for token in tokens:
112
+ if token.strip() and re.match(r"\w+", token):
113
+ emb_x = MAPPING_DOC_EMBEDDING[token]
114
+ assert emb_x.shape == (1, 1024)
115
+ encrypted_x = client.quantize_encrypt_serialize(emb_x)
116
+ assert isinstance(encrypted_x, bytes)
117
+ encrypted_tokens.append(encrypted_x)
118
+
119
+ print("Doc encrypted ✅ on Client Side")
120
+
121
+ # No need to save it
122
+ # write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_doc", b"".join(encrypted_tokens))
123
+
124
+ encrypted_quant_tokens_hex = [token.hex()[500:510] for token in encrypted_tokens]
125
+
126
+ return {
127
+ encrypted_doc_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=10),
128
+ anonymized_doc_output: gr.update(visible=True, value=None),
129
+ }
130
+
131
 
132
  def encrypt_query_fn(query):
133
 
134
  print(f"\n------------ Step 2: Query encryption: {query=}")
135
 
136
  if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
137
+ return {output_encrypted_box: gr.update(value="Error ❌: Please generate the key first!", lines=8)}
138
 
139
  if is_user_query_valid(query):
140
  return {
 
182
  encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
183
 
184
  return {
185
+ output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=8),
186
+ anonymized_query_output: gr.update(visible=True, value=None),
187
  identified_words_output_df: gr.update(visible=False, value=None),
188
  }
189
 
 
202
  "Error Encountered While Sending Data to the Server: "
203
  f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
204
  )
205
+ return {anonymized_query_output: gr.update(value=error_message)}
206
 
207
  if not encrypted_input_path.is_file():
208
  error_message = (
209
  "Error Encountered While Sending Data to the Server: The data has not been encrypted "
210
  f"correctly on the client side - {encrypted_input_path.is_file()=}"
211
  )
212
+ return {anonymized_query_output: gr.update(value=error_message)}
213
 
214
  # Define the data and files to post
215
  data = {"user_id": USER_ID, "input": query}
 
244
  "Error Encountered While Sending Data to the Server: "
245
  f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
246
  )
247
+ return {anonymized_query_output: gr.update(value=error_message)}
248
 
249
  if not encrypted_input_path.is_file():
250
  error_message = (
251
  "Error Encountered While Sending Data to the Server: The data has not been encrypted "
252
  f"correctly on the client side - {encrypted_input_path.is_file()=}"
253
  )
254
+ return {anonymized_query_output: gr.update(value=error_message)}
255
 
256
  data = {
257
  "user_id": USER_ID,
 
265
  ) as response:
266
  if not response.ok:
267
  return {
268
+ anonymized_query_output: gr.update(
269
  value=(
270
  "⚠️ An error occurred on the Server Side. "
271
  "Please check connectivity and data transmission."
 
286
  "Error Encountered While Sending Data to the Server: "
287
  "The key has not been generated correctly"
288
  )
289
+ return {anonymized_query_output: gr.update(value=error_message)}
290
 
291
  if not (KEYS_DIR / f"{USER_ID}/encrypted_input").is_file():
292
  error_message = (
293
  "Error Encountered While Sending Data to the Server: "
294
  "The data has not been encrypted correctly on the client side"
295
  )
296
+ return {anonymized_query_output: gr.update(value=error_message)}
297
 
298
  data = {
299
  "user_id": USER_ID,
 
398
  return anonymized_text, identified_df
399
 
400
 
401
+ def anonymization_with_fn(selected_sentences, query):
402
 
403
  encrypt_query_fn(query)
404
 
 
411
  anonymized_text, identified_df = decrypt_fn(query)
412
 
413
  return {
414
+ anonymized_doc_output: gr.update(value=select_static_anonymized_sentences_fn(selected_sentences)),
415
+ anonymized_query_output: gr.update(value=anonymized_text),
416
+ identified_words_output_df: gr.update(value=identified_df, visible=False),
417
  }
418
 
419
 
 
429
  error_message = "Error ❌: Please encrypt your query first!"
430
  return {chatgpt_response_anonymized: gr.update(value=error_message)}
431
 
432
+ context_prompt = read_txt(PROMPT_PATH)
433
 
434
  # Prepare prompt
 
435
  query = (
436
  "Document content:\n```\n"
437
  + anonymized_document
 
440
  + anonymized_query
441
  + "\n```"
442
  )
443
+ print(f'Prompt of CHATGPT:\n{query}')
444
 
445
  completion = client.chat.completions.create(
446
  model="gpt-4-1106-preview", # Replace with "gpt-4" if available
447
  messages=[
448
+ {"role": "system", "content": context_prompt},
449
  {"role": "user", "content": query},
450
  ],
451
  )
 
498
  """
499
  )
500
 
501
+ gr.Markdown(
502
+ """
503
+ <p align="center" style="font-size: 16px;">
504
+ Anonymization is the process of removing personally identifiable information (PII) data from
505
+ a document in order to protect individual privacy.</p>
506
+
507
+ <p align="center" style="font-size: 16px;">
508
+ Encrypted anonymization uses Fully Homomorphic Encryption (FHE) to anonymize personally
509
+ identifiable information (PII) within encrypted documents, enabling computations to be
510
+ performed on the encrypted data.</p>
511
+
512
+ <p align="center" style="font-size: 16px;">
513
+ In the example above, we're showing how encrypted anonymization can be leveraged to use LLM
514
+ services such as ChaGPT in a privacy-preserving manner.</p>
515
+ """
516
+ )
517
+
518
+ gr.Markdown(
519
  """
520
+ <p align="center">
521
+ <img width="75%" height="30%" src="https://raw.githubusercontent.com/kcelia/Img/main/fhe_anonymization_banner.png">
522
+ </p>
523
+ """
524
+ )
525
+
526
 
527
  ########################## Key Gen Part ##########################
528
 
 
566
  encrypt_doc_btn = gr.Button("Encrypt the document")
567
 
568
  with gr.Column(scale=5):
569
+ encrypted_doc_box = gr.Textbox(
570
+ label="Encrypted document:", show_label=True, interactive=False, lines=10
 
571
  )
572
 
 
 
 
 
 
573
 
574
  ########################## User Query Part ##########################
575
 
 
602
 
603
  with gr.Column(scale=1, min_width=6):
604
  gr.HTML("<div style='height: 77px;'></div>")
605
+ encrypt_query_btn = gr.Button("Encrypt the prompt")
606
  # gr.HTML("<div style='height: 50px;'></div>")
607
 
608
  with gr.Column(scale=5):
 
627
  with gr.Row():
628
  with gr.Column(scale=5):
629
 
630
+ anonymized_doc_output = gr.Textbox(
631
+ label="Decrypted and anonymized document", lines=10, interactive=True
632
  )
633
 
634
  with gr.Column(scale=5):
635
 
636
  anonymized_query_output = gr.Textbox(
637
+ label="Decrypted and anonymized prompt", lines=10, interactive=True
638
  )
639
 
640
 
641
  identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
642
 
643
+ encrypt_doc_btn.click(
644
+ fn=encrypt_doc_fn,
645
+ inputs=[original_sentences_box],
646
+ outputs=[encrypted_doc_box, anonymized_doc_output],
647
+ )
648
+
649
+ encrypt_query_btn.click(
650
  fn=encrypt_query_fn,
651
  inputs=[query_box],
652
  outputs=[
653
  query_box,
654
  output_encrypted_box,
655
+ anonymized_query_output,
656
  identified_words_output_df,
657
  ],
658
  )
659
 
660
  run_fhe_btn.click(
661
  anonymization_with_fn,
662
+ inputs=[original_sentences_box, query_box],
663
+ outputs=[anonymized_doc_output, anonymized_query_output, identified_words_output_df],
664
  )
665
 
666
  ########################## ChatGpt Part ##########################
 
682
 
683
  chatgpt_button.click(
684
  query_chatgpt_fn,
685
+ inputs=[anonymized_query_output, anonymized_doc_output],
686
  outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
687
  )
688
 
files/anonymized_document.txt CHANGED
@@ -1,11 +1,11 @@
1
- Members: e3383f5b 70fc6ec5 and 2708cb61 cda521d5
2
 
3
- Date: e381418b 3534158a, 96c403e5
4
 
5
- Scope: 2708cb61 agrees to provide graphic design services to e3383f5b for the creation of a company logo.
6
 
7
- Amount: Bob agrees to pay 2708cb61 500 upon completion and delivery of the logo.
8
 
9
  Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
10
 
11
- Payment terms: 2708cb61s international bank account N: 61294a43
 
1
+ Members: a5989a5c and 20f545cf
2
 
3
+ Date: 7bbd0258 28ebebcd, 87a7f982
4
 
5
+ Scope: 20f545cf agrees to provide graphic design services to a5989a5c for the creation of a company logo.
6
 
7
+ Amount: Bob agrees to pay 20f545cf 500 upon completion and delivery of the logo.
8
 
9
  Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
10
 
11
+ Payment terms: 20f545cf's international bank account N: 43a4c5f3
files/mapping_clear_to_anonymized.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce0f400a2f644ddf99bcbc76f856afc1ad79055b1f01133a69e7617d257de98c
3
- size 943
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aed1a1360ae82291357e5de8369d63d5514d90114743d1845b32642df9086902
3
+ size 906
files/mapping_clear_to_encrypted.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6dd8b3345ee3417bd83f4141007fc31211f30aaba1ceac3b847c8d525f1913f
3
- size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45e4ba890f0b8c8d239534f9c6c1d0878f5419b62af6b32d9d7e758a0490ea8a
3
+ size 916
files/mapping_doc_embedding_path.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faa0f74bc4358424e29118dc9714512f092d83756a77d596dd9ce56c9555b444
3
+ size 211319
files/original_document.txt CHANGED
@@ -1,4 +1,4 @@
1
- Members: David Johnson and Kate Hemingway
2
 
3
  Date: February 06, 2000
4
 
@@ -8,4 +8,4 @@ Amount: Bob agrees to pay Kate $500 upon completion and delivery of the logo.
8
 
9
  Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
10
 
11
- Payment terms: Kates international bank account N°: IL150120690000003111111
 
1
+ Members: David and Kate
2
 
3
  Date: February 06, 2000
4
 
 
8
 
9
  Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
10
 
11
+ Payment terms: Kate's international bank account N°: IL150120690000003111111
files/original_document_uuid_mapping.json CHANGED
@@ -1,10 +1,8 @@
1
  {
2
- "06": "3534158a",
3
- "2000": "96c403e5",
4
- "David": "e3383f5b",
5
- "February": "e381418b",
6
- "Hemingway": "cda521d5",
7
- "IL150120690000003111111": "61294a43",
8
- "Johnson": "70fc6ec5",
9
- "Kate": "2708cb61"
10
  }
 
1
  {
2
+ "06": "28ebebcd",
3
+ "2000": "87a7f982",
4
+ "David": "a5989a5c",
5
+ "February": "7bbd0258",
6
+ "IL150120690000003111111": "43a4c5f3",
7
+ "Kate": "20f545cf"
 
 
8
  }
utils_demo.py CHANGED
@@ -40,6 +40,8 @@ ANONYMIZED_FILE_PATH = DATA_PATH / "anonymized_document.txt"
40
  MAPPING_UUID_PATH = DATA_PATH / "original_document_uuid_mapping.json"
41
  MAPPING_ANONYMIZED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_anonymized.pkl"
42
  MAPPING_ENCRYPTED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_encrypted.pkl"
 
 
43
  PROMPT_PATH = DATA_PATH / "chatgpt_prompt.txt"
44
 
45
 
@@ -57,7 +59,8 @@ EMBEDDINGS_MODEL = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
57
  PUNCTUATION_LIST = list(string.punctuation)
58
  PUNCTUATION_LIST.remove("%")
59
  PUNCTUATION_LIST.remove("$")
60
- PUNCTUATION_LIST = "".join(PUNCTUATION_LIST)
 
61
 
62
 
63
  def clean_directory() -> None:
 
40
  MAPPING_UUID_PATH = DATA_PATH / "original_document_uuid_mapping.json"
41
  MAPPING_ANONYMIZED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_anonymized.pkl"
42
  MAPPING_ENCRYPTED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_encrypted.pkl"
43
+ MAPPING_DOC_EMBEDDING_PATH = DATA_PATH / "mapping_doc_embedding_path.pkl"
44
+
45
  PROMPT_PATH = DATA_PATH / "chatgpt_prompt.txt"
46
 
47
 
 
59
  PUNCTUATION_LIST = list(string.punctuation)
60
  PUNCTUATION_LIST.remove("%")
61
  PUNCTUATION_LIST.remove("$")
62
+ PUNCTUATION_LIST = "".join(PUNCTUATION_LIST) + '°'
63
+ print(f'{PUNCTUATION_LIST=}')
64
 
65
 
66
  def clean_directory() -> None: