Spaces:

zama-fhe
/

encrypted-anonymization

Running

App Files Files Community

kcelia committed on Apr 18

Commit

ce217e0

•

1 Parent(s): bc93019

chore: update Marketing v2

Browse files

Files changed (8) hide show

app.py +90 -59
files/anonymized_document.txt +5 -5
files/mapping_clear_to_anonymized.pkl +2 -2
files/mapping_clear_to_encrypted.pkl +2 -2
files/mapping_doc_embedding_path.pkl +3 -0
files/original_document.txt +2 -2
files/original_document_uuid_mapping.json +6 -8
utils_demo.py +4 -1

app.py CHANGED Viewed

@@ -35,6 +35,7 @@ ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
 MAPPING_ANONYMIZED_SENTENCES = read_pickle(MAPPING_ANONYMIZED_SENTENCES_PATH)
 MAPPING_ENCRYPTED_SENTENCES = read_pickle(MAPPING_ENCRYPTED_SENTENCES_PATH)
 ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
 print(ORIGINAL_DOCUMENT)
 # 4. Data Processing and Operations (No specific operations shown here, assuming it's part of anonymizer or client usage)
@@ -54,7 +55,7 @@ def select_static_anonymized_sentences_fn(selected_sentences: List):
     anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence]
-    return {anonymized_doc_box: gr.update(value="\n\n".join(anonymized_selected_sentence))}
 def key_gen_fn() -> Dict:
@@ -92,23 +93,48 @@ def key_gen_fn() -> Dict:
         print("Keys have been generated ✅")
         return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
-def select_static_encrypted_sentences_fn(selected_sentences: List):
-    selected_sentences = [MAPPING_ENCRYPTED_SENTENCES[sentence] for sentence in selected_sentences]
-    anonymized_selected_sentence = sorted(selected_sentences, key=lambda x: x[0])
-    anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence]
-    return {encrypted_doc_box: gr.update(value="\n\n".join(anonymized_selected_sentence))}
 def encrypt_query_fn(query):
     print(f"\n------------ Step 2: Query encryption: {query=}")
     if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
-        return {output_encrypted_box: gr.update(value="Error ❌: Please generate the key first!")}
     if is_user_query_valid(query):
         return {
@@ -156,8 +182,8 @@ def encrypt_query_fn(query):
     encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
     return {
-        output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=4),
-        anonymized_text_output: gr.update(visible=True, value=None),
         identified_words_output_df: gr.update(visible=False, value=None),
     }
@@ -176,14 +202,14 @@ def send_input_fn(query) -> Dict:
             "Error Encountered While Sending Data to the Server: "
             f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
         )
-        return {anonymized_text_output: gr.update(value=error_message)}
     if not encrypted_input_path.is_file():
         error_message = (
             "Error Encountered While Sending Data to the Server: The data has not been encrypted "
             f"correctly on the client side - {encrypted_input_path.is_file()=}"
         )
-        return {anonymized_text_output: gr.update(value=error_message)}
     # Define the data and files to post
     data = {"user_id": USER_ID, "input": query}
@@ -218,14 +244,14 @@ def run_fhe_in_server_fn() -> Dict:
             "Error Encountered While Sending Data to the Server: "
             f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
         )
-        return {anonymized_text_output: gr.update(value=error_message)}
     if not encrypted_input_path.is_file():
         error_message = (
             "Error Encountered While Sending Data to the Server: The data has not been encrypted "
             f"correctly on the client side - {encrypted_input_path.is_file()=}"
         )
-        return {anonymized_text_output: gr.update(value=error_message)}
     data = {
         "user_id": USER_ID,
@@ -239,7 +265,7 @@ def run_fhe_in_server_fn() -> Dict:
     ) as response:
         if not response.ok:
             return {
-                anonymized_text_output: gr.update(
                     value=(
                         "⚠️ An error occurred on the Server Side. "
                         "Please check connectivity and data transmission."
@@ -260,14 +286,14 @@ def get_output_fn() -> Dict:
             "Error Encountered While Sending Data to the Server: "
             "The key has not been generated correctly"
         )
-        return {anonymized_text_output: gr.update(value=error_message)}
     if not (KEYS_DIR / f"{USER_ID}/encrypted_input").is_file():
         error_message = (
             "Error Encountered While Sending Data to the Server: "
             "The data has not been encrypted correctly on the client side"
         )
-        return {anonymized_text_output: gr.update(value=error_message)}
     data = {
         "user_id": USER_ID,
@@ -372,7 +398,7 @@ def decrypt_fn(text) -> Dict:
     return anonymized_text, identified_df
-def anonymization_with_fn(query):
     encrypt_query_fn(query)
@@ -385,8 +411,9 @@ def anonymization_with_fn(query):
     anonymized_text, identified_df = decrypt_fn(query)
     return {
-        anonymized_text_output: gr.update(value=anonymized_text),
-        identified_words_output_df: gr.update(value=identified_df, visible=True),
     }
@@ -402,10 +429,9 @@ def query_chatgpt_fn(anonymized_query, anonymized_document):
         error_message = "Error ❌: Please encrypt your query first!"
         return {chatgpt_response_anonymized: gr.update(value=error_message)}
-    prompt = read_txt(PROMPT_PATH)
     # Prepare prompt
-    initial_prompt = prompt + "\n"
     query = (
         "Document content:\n```\n"
         + anonymized_document
@@ -414,12 +440,12 @@ def query_chatgpt_fn(anonymized_query, anonymized_document):
         + anonymized_query
         + "\n```"
     )
-    print(f'initial_prompt:\n{initial_prompt}')
     completion = client.chat.completions.create(
         model="gpt-4-1106-preview",  # Replace with "gpt-4" if available
         messages=[
-            {"role": "system", "content": prompt},
             {"role": "user", "content": query},
         ],
     )
@@ -472,26 +498,31 @@ with demo:
         """
     )
-    # gr.Markdown(
-    #     """
-    #     <p align="center">
-    #         <img width="15%" height="15%" src="./encrypted_anonymization_diagram.jpg">
-    #     </p>
-    #     """
-    # )
-    with gr.Accordion("What is encrypted anonymization?", open=False):
-        gr.Markdown(
-        """Anonymization is the process of removing personally identifiable information (PII) data
-        from a document in order to protect individual privacy.
-        Encrypted anonymization using Fully Homomorphic Encryption (FHE) solves issues when
-        deploying such tool through an untrusted cloud service, as Fully Homomorphic Encryption
-        (FHE) allows such services to anonymize personally identifiable information (PII) on an
-        encrypted document. Once the data is anonymized, it can safely be sent to LLM services such
-        as ChatGPT.
         """
-        )
     ########################## Key Gen Part ##########################
@@ -535,16 +566,10 @@ with demo:
             encrypt_doc_btn = gr.Button("Encrypt the document")
         with gr.Column(scale=5):
-            anonymized_doc_box = gr.Textbox(
-                label="Encrypted document:",
-                show_label=True, value=ANONYMIZED_DOCUMENT, interactive=False, lines=11
             )
-    original_sentences_box.change(
-        fn=select_static_anonymized_sentences_fn,
-        inputs=[original_sentences_box],
-        outputs=[anonymized_doc_box],
-    )
     ########################## User Query Part ##########################
@@ -577,7 +602,7 @@ with demo:
         with gr.Column(scale=1, min_width=6):
             gr.HTML("<div style='height: 77px;'></div>")
-            encrypt_btn = gr.Button("Encrypt the prompt")
             # gr.HTML("<div style='height: 50px;'></div>")
         with gr.Column(scale=5):
@@ -602,34 +627,40 @@ with demo:
     with gr.Row():
         with gr.Column(scale=5):
-            anonymized_text_output = gr.Textbox(
-                label="Decrypted and anonymized document", lines=5, interactive=True
             )
         with gr.Column(scale=5):
             anonymized_query_output = gr.Textbox(
-                label="Decrypted and anonymized prompt", lines=5, interactive=True
             )
     identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
-    encrypt_btn.click(
         fn=encrypt_query_fn,
         inputs=[query_box],
         outputs=[
             query_box,
             output_encrypted_box,
-            anonymized_text_output,
             identified_words_output_df,
         ],
     )
     run_fhe_btn.click(
         anonymization_with_fn,
-        inputs=[query_box],
-        outputs=[anonymized_text_output, identified_words_output_df],
     )
     ########################## ChatGpt Part ##########################
@@ -651,7 +682,7 @@ with demo:
     chatgpt_button.click(
         query_chatgpt_fn,
-        inputs=[anonymized_text_output, anonymized_doc_box],
         outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
     )

 MAPPING_ANONYMIZED_SENTENCES = read_pickle(MAPPING_ANONYMIZED_SENTENCES_PATH)
 MAPPING_ENCRYPTED_SENTENCES = read_pickle(MAPPING_ENCRYPTED_SENTENCES_PATH)
 ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
+MAPPING_DOC_EMBEDDING = read_pickle(MAPPING_DOC_EMBEDDING_PATH)
 print(ORIGINAL_DOCUMENT)
 # 4. Data Processing and Operations (No specific operations shown here, assuming it's part of anonymizer or client usage)
     anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence]
+    return "\n\n".join(anonymized_selected_sentence)
 def key_gen_fn() -> Dict:
         print("Keys have been generated ✅")
         return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
+def encrypt_doc_fn(doc):
+    print(f"\n------------ Step 2.1: Doc encryption: {doc=}")
+    if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
+        return {encrypted_doc_box: gr.update(value="Error ❌: Please generate the key first!", lines=10)}
+    # Retrieve the client API
+    client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
+    client.load()
+    encrypted_tokens = []
+    tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", ' '.join(doc))
+    for token in tokens:
+        if token.strip() and re.match(r"\w+", token):
+            emb_x = MAPPING_DOC_EMBEDDING[token]
+            assert emb_x.shape == (1, 1024)
+            encrypted_x = client.quantize_encrypt_serialize(emb_x)
+            assert isinstance(encrypted_x, bytes)
+            encrypted_tokens.append(encrypted_x)
+    print("Doc encrypted ✅ on Client Side")
+    # No need to save it
+    # write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_doc", b"".join(encrypted_tokens))
+    encrypted_quant_tokens_hex = [token.hex()[500:510] for token in encrypted_tokens]
+    return {
+        encrypted_doc_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=10),
+        anonymized_doc_output: gr.update(visible=True, value=None),
+    }
 def encrypt_query_fn(query):
     print(f"\n------------ Step 2: Query encryption: {query=}")
     if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
+        return {output_encrypted_box: gr.update(value="Error ❌: Please generate the key first!", lines=8)}
     if is_user_query_valid(query):
         return {
     encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
     return {
+        output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=8),
+        anonymized_query_output: gr.update(visible=True, value=None),
         identified_words_output_df: gr.update(visible=False, value=None),
     }
             "Error Encountered While Sending Data to the Server: "
             f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
         )
+        return {anonymized_query_output: gr.update(value=error_message)}
     if not encrypted_input_path.is_file():
         error_message = (
             "Error Encountered While Sending Data to the Server: The data has not been encrypted "
             f"correctly on the client side - {encrypted_input_path.is_file()=}"
         )
+        return {anonymized_query_output: gr.update(value=error_message)}
     # Define the data and files to post
     data = {"user_id": USER_ID, "input": query}
             "Error Encountered While Sending Data to the Server: "
             f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
         )
+        return {anonymized_query_output: gr.update(value=error_message)}
     if not encrypted_input_path.is_file():
         error_message = (
             "Error Encountered While Sending Data to the Server: The data has not been encrypted "
             f"correctly on the client side - {encrypted_input_path.is_file()=}"
         )
+        return {anonymized_query_output: gr.update(value=error_message)}
     data = {
         "user_id": USER_ID,
     ) as response:
         if not response.ok:
             return {
+                anonymized_query_output: gr.update(
                     value=(
                         "⚠️ An error occurred on the Server Side. "
                         "Please check connectivity and data transmission."
             "Error Encountered While Sending Data to the Server: "
             "The key has not been generated correctly"
         )
+        return {anonymized_query_output: gr.update(value=error_message)}
     if not (KEYS_DIR / f"{USER_ID}/encrypted_input").is_file():
         error_message = (
             "Error Encountered While Sending Data to the Server: "
             "The data has not been encrypted correctly on the client side"
         )
+        return {anonymized_query_output: gr.update(value=error_message)}
     data = {
         "user_id": USER_ID,
     return anonymized_text, identified_df
+def anonymization_with_fn(selected_sentences, query):
     encrypt_query_fn(query)
     anonymized_text, identified_df = decrypt_fn(query)
     return {
+        anonymized_doc_output: gr.update(value=select_static_anonymized_sentences_fn(selected_sentences)),
+        anonymized_query_output: gr.update(value=anonymized_text),
+        identified_words_output_df: gr.update(value=identified_df, visible=False),
     }
         error_message = "Error ❌: Please encrypt your query first!"
         return {chatgpt_response_anonymized: gr.update(value=error_message)}
+    context_prompt = read_txt(PROMPT_PATH)
     # Prepare prompt
     query = (
         "Document content:\n```\n"
         + anonymized_document
         + anonymized_query
         + "\n```"
     )
+    print(f'Prompt of CHATGPT:\n{query}')
     completion = client.chat.completions.create(
         model="gpt-4-1106-preview",  # Replace with "gpt-4" if available
         messages=[
+            {"role": "system", "content": context_prompt},
             {"role": "user", "content": query},
         ],
     )
         """
     )
+    gr.Markdown(
+    """
+    <p align="center" style="font-size: 16px;">
+        Anonymization is the process of removing personally identifiable information (PII) data from
+        a document in order to protect individual privacy.</p>
+    <p align="center" style="font-size: 16px;">
+        Encrypted anonymization uses Fully Homomorphic Encryption (FHE) to anonymize personally
+        identifiable information (PII) within encrypted documents, enabling computations to be
+        performed on the encrypted data.</p>
+    <p align="center" style="font-size: 16px;">
+        In the example above, we're showing how encrypted anonymization can be leveraged to use LLM
+        services such as ChatGPT in a privacy-preserving manner.</p>
+    """
+    )
+    gr.Markdown(
         """
+        <p align="center">
+            <img width="75%" height="30%" src="https://raw.githubusercontent.com/kcelia/Img/main/fhe_anonymization_banner.png">
+        </p>
+        """
+    )
     ########################## Key Gen Part ##########################
             encrypt_doc_btn = gr.Button("Encrypt the document")
         with gr.Column(scale=5):
+            encrypted_doc_box = gr.Textbox(
+                label="Encrypted document:", show_label=True, interactive=False, lines=10
             )
     ########################## User Query Part ##########################
         with gr.Column(scale=1, min_width=6):
             gr.HTML("<div style='height: 77px;'></div>")
+            encrypt_query_btn = gr.Button("Encrypt the prompt")
             # gr.HTML("<div style='height: 50px;'></div>")
         with gr.Column(scale=5):
     with gr.Row():
         with gr.Column(scale=5):
+            anonymized_doc_output = gr.Textbox(
+                label="Decrypted and anonymized document", lines=10, interactive=True
             )
         with gr.Column(scale=5):
             anonymized_query_output = gr.Textbox(
+                label="Decrypted and anonymized prompt", lines=10, interactive=True
             )
     identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
+    encrypt_doc_btn.click(
+        fn=encrypt_doc_fn,
+        inputs=[original_sentences_box],
+        outputs=[encrypted_doc_box, anonymized_doc_output],
+    )
+    encrypt_query_btn.click(
         fn=encrypt_query_fn,
         inputs=[query_box],
         outputs=[
             query_box,
             output_encrypted_box,
+            anonymized_query_output,
             identified_words_output_df,
         ],
     )
     run_fhe_btn.click(
         anonymization_with_fn,
+        inputs=[original_sentences_box, query_box],
+        outputs=[anonymized_doc_output, anonymized_query_output, identified_words_output_df],
     )
     ########################## ChatGpt Part ##########################
     chatgpt_button.click(
         query_chatgpt_fn,
+        inputs=[anonymized_query_output, anonymized_doc_output],
         outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
     )

files/anonymized_document.txt CHANGED Viewed

@@ -1,11 +1,11 @@
-Members: e3383f5b 70fc6ec5 and 2708cb61 cda521d5
-Date: e381418b 3534158a, 96c403e5
-Scope: 2708cb61 agrees to provide graphic design services to e3383f5b for the creation of a company logo.
-Amount: Bob agrees to pay 2708cb61 500 upon completion and delivery of the logo.
 Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
-Payment terms: 2708cb61s international bank account N: 61294a43

+Members: a5989a5c and 20f545cf
+Date: 7bbd0258 28ebebcd, 87a7f982
+Scope: 20f545cf agrees to provide graphic design services to a5989a5c for the creation of a company logo.
+Amount: Bob agrees to pay 20f545cf 500 upon completion and delivery of the logo.
 Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
+Payment terms: 20f545cf's international bank account N: 43a4c5f3

files/mapping_clear_to_anonymized.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ce0f400a2f644ddf99bcbc76f856afc1ad79055b1f01133a69e7617d257de98c
-size 943

 version https://git-lfs.github.com/spec/v1
+oid sha256:aed1a1360ae82291357e5de8369d63d5514d90114743d1845b32642df9086902
+size 906

files/mapping_clear_to_encrypted.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b6dd8b3345ee3417bd83f4141007fc31211f30aaba1ceac3b847c8d525f1913f
-size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:45e4ba890f0b8c8d239534f9c6c1d0878f5419b62af6b32d9d7e758a0490ea8a
+size 916

files/mapping_doc_embedding_path.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:faa0f74bc4358424e29118dc9714512f092d83756a77d596dd9ce56c9555b444
+size 211319

files/original_document.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-Members: David Johnson and Kate Hemingway
 Date: February 06, 2000
@@ -8,4 +8,4 @@ Amount: Bob agrees to pay Kate $500 upon completion and delivery of the logo.
 Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
-Payment terms: Kate’s international bank account N°: IL150120690000003111111

+Members: David and Kate
 Date: February 06, 2000
 Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
+Payment terms: Kate's international bank account N°: IL150120690000003111111

files/original_document_uuid_mapping.json CHANGED Viewed

@@ -1,10 +1,8 @@
 {
-    "06": "3534158a",
-    "2000": "96c403e5",
-    "David": "e3383f5b",
-    "February": "e381418b",
-    "Hemingway": "cda521d5",
-    "IL150120690000003111111": "61294a43",
-    "Johnson": "70fc6ec5",
-    "Kate": "2708cb61"
 }

 {
+    "06": "28ebebcd",
+    "2000": "87a7f982",
+    "David": "a5989a5c",
+    "February": "7bbd0258",
+    "IL150120690000003111111": "43a4c5f3",
+    "Kate": "20f545cf"
 }

utils_demo.py CHANGED Viewed

@@ -40,6 +40,8 @@ ANONYMIZED_FILE_PATH = DATA_PATH / "anonymized_document.txt"
 MAPPING_UUID_PATH = DATA_PATH / "original_document_uuid_mapping.json"
 MAPPING_ANONYMIZED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_anonymized.pkl"
 MAPPING_ENCRYPTED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_encrypted.pkl"
 PROMPT_PATH = DATA_PATH / "chatgpt_prompt.txt"
@@ -57,7 +59,8 @@ EMBEDDINGS_MODEL = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
 PUNCTUATION_LIST = list(string.punctuation)
 PUNCTUATION_LIST.remove("%")
 PUNCTUATION_LIST.remove("$")
-PUNCTUATION_LIST = "".join(PUNCTUATION_LIST)
 def clean_directory() -> None:

 MAPPING_UUID_PATH = DATA_PATH / "original_document_uuid_mapping.json"
 MAPPING_ANONYMIZED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_anonymized.pkl"
 MAPPING_ENCRYPTED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_encrypted.pkl"
+MAPPING_DOC_EMBEDDING_PATH = DATA_PATH / "mapping_doc_embedding_path.pkl"
 PROMPT_PATH = DATA_PATH / "chatgpt_prompt.txt"
 PUNCTUATION_LIST = list(string.punctuation)
 PUNCTUATION_LIST.remove("%")
 PUNCTUATION_LIST.remove("$")
+PUNCTUATION_LIST = "".join(PUNCTUATION_LIST) + '°'
+print(f'{PUNCTUATION_LIST=}')
 def clean_directory() -> None: