Spaces:
Sleeping
Sleeping
"""A Gradio app for anonymizing text data using FHE.""" | |
import os | |
import re | |
from typing import Dict, List | |
import numpy | |
import gradio as gr | |
import pandas as pd | |
from fhe_anonymizer import FHEAnonymizer | |
from openai import OpenAI | |
from utils_demo import * | |
from concrete.ml.deployment import FHEModelClient | |
# `subprocess` and `time` are used below but are not imported explicitly in the import
# section of this file; import them here rather than relying on the wildcard import from
# `utils_demo` to provide them.
import subprocess
import time

# Static demo assets: the original document split into paragraphs on blank lines, its
# anonymized counterpart, and the mapping used to look up each original sentence.
ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
MAPPING_SENTENCES = read_pickle(MAPPING_SENTENCES_PATH)

# Launch the `server:app` application with uvicorn as a child process, then pause for a
# few seconds before continuing (presumably to let the server come up).
subprocess.Popen(["uvicorn", "server:app"], cwd=CURRENT_DIR)
time.sleep(3)

# NOTE(review): presumably removes artifacts left over from a previous run; confirm in
# `utils_demo`.
clean_directory()

anonymizer = FHEAnonymizer()

# The OpenAI API key is read from the `openaikey` environment variable.
client = OpenAI(api_key=os.environ.get("openaikey"))

# Generate a random user ID. The dtype is given explicitly because the default integer
# dtype of `randint` can be 32-bit on some platforms, where an upper bound of 2**32 is
# rejected; the result is converted back to a plain `int` for path formatting.
user_id = int(numpy.random.randint(0, 2**32, dtype=numpy.uint32))
print(f"Your user ID is: {user_id}....")
def select_static_sentences_fn(selected_sentences: List):
    """Rebuild the anonymized document from the sentences currently selected.

    Args:
        selected_sentences (List): The original sentences ticked in the checkbox group. Each
            one must be a key of `MAPPING_SENTENCES`.

    Returns:
        dict: A Gradio update for `anonymized_doc_box` whose value is the anonymized
            sentences, ordered by the first element of their mapping entry and separated
            by blank lines.
    """
    # Every mapping entry is unpacked below as a (sort key, anonymized sentence) pair
    mapped_entries = [MAPPING_SENTENCES[sentence] for sentence in selected_sentences]
    mapped_entries.sort(key=lambda entry: entry[0])

    anonymized_text = "\n\n".join(anonymized for _, anonymized in mapped_entries)
    return {anonymized_doc_box: gr.update(value=anonymized_text)}
def key_gen_fn() -> Dict:
    """Generate the FHE keys for the current user and store the evaluation key on disk.

    The keys are created through an `FHEModelClient` whose key directory is the
    user-specific sub-directory of `KEYS_DIR`. The serialized evaluation key is then written
    to `evaluation_key` inside that same sub-directory.

    Returns:
        dict: A Gradio update for `gen_key_btn`, carrying either a success label or an error
            message if the evaluation key file is not found after writing it.
    """
    print("Step 1: Key Generation:")

    user_key_dir = KEYS_DIR / f"{user_id}"
    evaluation_key_path = user_key_dir / "evaluation_key"

    fhe_client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=user_key_dir)
    fhe_client.load()

    # Create the private and evaluation keys on the client side, then fetch the serialized
    # evaluation keys
    fhe_client.generate_private_and_evaluation_keys()
    evaluation_keys = fhe_client.get_serialized_evaluation_keys()
    assert isinstance(evaluation_keys, bytes)

    # Persist the evaluation key
    with evaluation_key_path.open("wb") as key_file:
        key_file.write(evaluation_keys)

    if evaluation_key_path.is_file():
        return {gen_key_btn: gr.update(value="Keys have been generated ✅")}

    error_message = (
        f"Error Encountered While generating the evaluation {evaluation_key_path.is_file()=}"
    )
    print(error_message)
    return {gen_key_btn: gr.update(value=error_message)}
def encrypt_query_fn(query):
    """Tokenize, embed and encrypt the user query, one token at a time.

    Args:
        query (str): The query typed or selected by the user.

    Returns:
        dict: A Gradio update. On success, `output_encrypted_box` receives a short hexadecimal
            excerpt of every encrypted token, separated by spaces. If the evaluation key is
            missing, `output_encrypted_box` receives an error message. If the query is
            rejected, `query_box` receives the rejection message.
    """
    print(f"Step 2 Query encryption: {query=}")

    user_key_dir = KEYS_DIR / f"{user_id}"

    # The keys must have been generated beforehand
    if not (user_key_dir / "evaluation_key").is_file():
        return {output_encrypted_box: gr.update(value="Error ❌: Please generate the key first!")}

    # NOTE(review): a truthy result from `is_user_query_valid` is treated as a rejection
    # here, which is the opposite of what the helper's name suggests; confirm against its
    # implementation in `utils_demo`.
    if is_user_query_valid(query):
        error_msg = (
            "Unable to process ❌: The request exceeds the length limit or falls "
            "outside the scope of this document. Please refine your query."
        )
        print(error_msg)
        return {query_box: gr.update(value=error_msg)}

    # Retrieve the client API
    fhe_client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=user_key_dir)
    fhe_client.load()

    # Split the query into words and non-words (punctuation, spaces, etc.)
    encrypted_tokens = []
    for token in re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", query):
        # Tokens made only of whitespace are not encrypted
        if re.match(r"^\s+$", token) is not None:
            continue

        # Embed the token, then quantize, encrypt and serialize its representation
        embedding = get_batch_text_representation([token], EMBEDDINGS_MODEL, TOKENIZER)
        encrypted_embedding = fhe_client.quantize_encrypt_serialize(embedding)
        assert isinstance(encrypted_embedding, bytes)
        encrypted_tokens.append(encrypted_embedding)

    write_pickle(user_key_dir / "encrypted_input", encrypted_tokens)

    # Only a 10-character slice of each ciphertext's hex form is displayed
    hex_preview = " ".join(blob.hex()[500:510] for blob in encrypted_tokens)
    return {output_encrypted_box: gr.update(value=hex_preview)}
def run_fhe_fn(query_box):
    """Run the anonymization on the encrypted query and return the decrypted result.

    Args:
        query_box (str): The user query, forwarded to
            `anonymizer.run_server_and_decrypt_output`.

    Returns:
        Union[dict, tuple]: If a previous step is missing, a Gradio update writing an error
            message into `anonymized_text_output`. Otherwise a pair made of the anonymized
            text and a DataFrame with the columns "Identified Words" and "Probability".
    """
    # The previous steps store their artifacts in the user-specific sub-directory of
    # `KEYS_DIR` (`evaluation_key` from key generation, `encrypted_input` from query
    # encryption), so the guards must look there to detect whether those steps were run.
    user_key_dir = KEYS_DIR / f"{user_id}"

    if not (user_key_dir / "evaluation_key").is_file():
        error_message = "Error ❌: Please generate the key first!"
        return {anonymized_text_output: gr.update(value=error_message)}

    if not (user_key_dir / "encrypted_input").is_file():
        error_message = "Error ❌: Please encrypt your query first!"
        return {anonymized_text_output: gr.update(value=error_message)}

    # NOTE(review): `FHEAnonymizer` is defined outside this file; confirm that it reads its
    # inputs from the per-user directory as well, and that it is what writes the two
    # result files read below at the root of `KEYS_DIR`.
    anonymizer.run_server_and_decrypt_output(query_box)

    anonymized_text = read_pickle(KEYS_DIR / "reconstructed_sentence")

    # Remove the whitespace character preceding a punctuation mark
    anonymized_text = re.sub(r"\s([,.!?;:])", r"\1", anonymized_text)

    identified_words_with_prob = read_pickle(KEYS_DIR / "identified_words_with_prob")

    # Convert the identified words and their probabilities into a DataFrame, falling back
    # to an empty frame with the same columns when nothing was identified
    columns = ["Identified Words", "Probability"]
    if identified_words_with_prob:
        identified_df = pd.DataFrame(identified_words_with_prob, columns=columns)
    else:
        identified_df = pd.DataFrame(columns=columns)

    return anonymized_text, identified_df
def query_chatgpt_fn(anonymized_query, anonymized_document):
    """Send the anonymized document and query to ChatGPT and de-anonymize the answer.

    Args:
        anonymized_query (str): The anonymized query produced by the FHE step.
        anonymized_document (str): The anonymized document shown in the interface.

    Returns:
        Union[dict, tuple]: If a previous step is missing, a Gradio update writing an error
            message into `chatgpt_response_anonymized`. Otherwise a pair made of the raw
            (anonymized) response and the same response where every word found among the
            values of the mapping stored at `MAPPING_UUID_PATH` is replaced by its key.
    """
    # Key generation and query encryption store their artifacts in the user-specific
    # sub-directory of `KEYS_DIR`, while the decrypted sentence is read from the root of
    # `KEYS_DIR` by the FHE step of this application.
    user_key_dir = KEYS_DIR / f"{user_id}"
    prerequisites = (
        (user_key_dir / "evaluation_key", "Error ❌: Please generate the key first!"),
        (user_key_dir / "encrypted_input", "Error ❌: Please encrypt your query first!"),
        (KEYS_DIR / "reconstructed_sentence", "Error ❌: Please run the FHE computation first!"),
    )
    for required_path, error_message in prerequisites:
        if not required_path.is_file():
            # The error must target a component listed in this handler's outputs
            return {chatgpt_response_anonymized: gr.update(value=error_message)}

    prompt = read_txt(PROMPT_PATH)

    # Prepare prompt
    full_prompt = prompt + "\n"
    query = (
        "Document content:\n```\n"
        + anonymized_document
        + "\n\n```"
        + "Query:\n```\n"
        + anonymized_query
        + "\n```"
    )
    print(full_prompt)

    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",  # Replace with "gpt-4" if available
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": query},
        ],
    )
    # The message content may be missing; treat that case as an empty answer
    anonymized_response = completion.choices[0].message.content or ""

    uuid_map = read_json(MAPPING_UUID_PATH)
    inverse_uuid_map = {
        v: k for k, v in uuid_map.items()
    }  # TODO load the inverse mapping from disk for efficiency

    # Substitute only the word-like tokens and leave every other character untouched, so
    # that symbols such as parentheses are not lost from the de-anonymized response
    deanonymized_response = re.sub(
        r"\b[\w\.\/\-@]+\b",
        lambda match: inverse_uuid_map.get(match.group(0), match.group(0)),
        anonymized_response,
    )

    return anonymized_response, deanonymized_response
# Definition of the Gradio interface. Multi-line Markdown/HTML strings are kept flush-left
# so that their content does not depend on the indentation of the surrounding code.
demo = gr.Blocks(css=".markdown-body { font-size: 18px; }")

with demo:

    gr.Markdown(
        """
<p align="center">
<img width=200 src="file/images/logos/zama.jpg">
</p>
<h1 style="text-align: center;">Encrypted Anonymization Using Fully Homomorphic Encryption</h1>
<p align="center">
<a href="https://github.com/zama-ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/github.png">Concrete-ML</a>
—
<a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/documentation.png">Documentation</a>
—
<a href=" https://community.zama.ai/c/concrete-ml/8"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/community.png">Community</a>
—
<a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/x.png">@zama_fhe</a>
</p>
"""
    )

    # gr.Markdown(
    # """
    # <p align="center">
    # <img width="15%" height="15%" src="./encrypted_anonymization_diagram.jpg">
    # </p>
    # """
    # )

    with gr.Accordion("What is encrypted anonymization?", open=False):
        gr.Markdown(
            """
Anonymization is the process of removing personally identifiable information (PII)
from data to protect individual privacy.
To resolve trust issues when deploying anonymization as a cloud service, Fully Homomorphic
Encryption (FHE) can be used to preserve the privacy of the original data using
encryption.
The data remains encrypted throughout the anonymization process, eliminating the need for
third-party access to the raw data. Once the data is anonymized, it can safely be sent
to GenAI services such as ChatGPT.
"""
        )

    ########################## Key Gen Part ##########################

    gr.Markdown(
        "## Step 1: Key generation\n\n"
        """In FHE schemes, two sets of keys are generated. First, the secret keys which are used for
encrypting and decrypting data owned by the client. Second, the evaluation keys that allow
a server to blindly process the encrypted data.
"""
    )

    # The button label is replaced by the status message returned by `key_gen_fn`
    gen_key_btn = gr.Button("Generate the secret and evaluation keys")

    gen_key_btn.click(
        key_gen_fn,
        inputs=[],
        outputs=[gen_key_btn],
    )

    ########################## Main document Part ##########################

    gr.Markdown("## Step 2: Private document")

    with gr.Row():
        with gr.Column():
            gr.Markdown("**Original document:**")
            gr.Markdown(
                """This document was retrieved from the [Microsoft Presidio](https://huggingface.co/spaces/presidio/presidio_demo) demo.\n\n
You can select and deselect sentences to customize the document that will be used
as the initial prompt for ChatGPT in step 5.
"""
            )
        with gr.Column():
            gr.Markdown("**Anonymized document:**")
            gr.Markdown(
                """You can see below the anonymized text, replaced with hexadecimal strings, that
will be sent to ChatGPT.
ChatGPT will then be able to answer any queries about the document.
"""
            )

    with gr.Row():
        with gr.Column():
            original_sentences_box = gr.CheckboxGroup(
                ORIGINAL_DOCUMENT, value=ORIGINAL_DOCUMENT, show_label=False,
            )

        with gr.Column():
            anonymized_doc_box = gr.Textbox(show_label=False,
                value=ANONYMIZED_DOCUMENT, interactive=False, lines=11
            )

    # Keep the anonymized document in sync with the selected original sentences
    original_sentences_box.change(
        fn=select_static_sentences_fn,
        inputs=[original_sentences_box],
        outputs=[anonymized_doc_box],
    )

    ########################## User Query Part ##########################

    gr.Markdown("<hr />")
    gr.Markdown("## Step 3: Private query")
    gr.Markdown(
        """Now, you can formulate a query. Please choose from the predefined options in
“Queries examples” or craft a custom question in the “Customized query” text box.
Remain concise and relevant to the context. Any off-topic query will not be processed.
"""
    )

    with gr.Row():
        with gr.Column(scale=5):
            with gr.Column(scale=5):
                default_query_box = gr.Dropdown(
                    list(DEFAULT_QUERIES.values()), label="Queries examples:"
                )

                gr.Markdown("Or")

                query_box = gr.Textbox(
                    value="Who lives in Maine?", label="Customized query:", interactive=True
                )

                # Selecting a predefined query copies it into the customized query box
                default_query_box.change(
                    fn=lambda default_query_box: default_query_box,
                    inputs=[default_query_box],
                    outputs=[query_box],
                )

        with gr.Column(scale=1, min_width=6):
            gr.HTML("<div style='height: 25px;'></div>")
            gr.Markdown(
                """
<p align="center">
Encrypt the query locally with FHE
</p>
"""
            )
            encrypt_btn = gr.Button("Encrypt query")
            gr.HTML("<div style='height: 25px;'></div>")

        with gr.Column(scale=5):
            output_encrypted_box = gr.Textbox(
                label="Encrypted anonymized query that will be sent to the anonymization server:", lines=8
            )

    # `query_box` is also an output because `encrypt_query_fn` writes its rejection message
    # there
    encrypt_btn.click(
        fn=encrypt_query_fn, inputs=[query_box], outputs=[query_box, output_encrypted_box]
    )

    ########################## FHE processing Part ##########################

    gr.Markdown("<hr />")
    gr.Markdown("## Step 4: Secure anonymization with FHE")
    gr.Markdown(
        """ Once the client encrypts the private query locally, it will be sent to a remote server
to perform the anonymization on encrypted data. When the computation is done, the server
will return the result to the client for decryption.
"""
    )

    run_fhe_btn = gr.Button("Anonymize with FHE")

    anonymized_text_output = gr.Textbox(
        label="Decrypted anonymized query that will be sent to ChatGPT:", lines=1, interactive=True
    )

    identified_words_output = gr.Dataframe(label="Identified words:", visible=False)

    run_fhe_btn.click(
        run_fhe_fn,
        inputs=[query_box],
        outputs=[anonymized_text_output, identified_words_output],
    )

    ########################## ChatGpt Part ##########################

    gr.Markdown("<hr />")
    gr.Markdown("## Step 5: Secure your communication on ChatGPT with anonymized queries")
    gr.Markdown(
        """After securely anonymizing the query with FHE,
you can forward it to ChatGPT without having any concern about information leakage."""
    )

    chatgpt_button = gr.Button("Query ChatGPT")

    with gr.Row():
        chatgpt_response_anonymized = gr.Textbox(label="ChatGPT's anonymized response:", lines=13)
        chatgpt_response_deanonymized = gr.Textbox(
            label="ChatGPT's non-anonymized response:", lines=13
        )

    chatgpt_button.click(
        query_chatgpt_fn,
        inputs=[anonymized_text_output, anonymized_doc_box],
        outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
    )

    gr.Markdown(
        """**Please note**: As this space is intended solely for demonstration purposes, some
private information may be missed by the anonymization algorithm. Please validate the
following query before sending it to ChatGPT."""
    )

# Launch the app
demo.launch(share=False)