"""Demo UI to show different levels of LLM security."""
import pandas as pd
import streamlit as st
from streamlit_extras.stylable_container import stylable_container

import config
import llm
# Page-level configuration (browser tab title/icon, wide layout, open sidebar).
# Kept ahead of every other st.* call in the script.
st.set_page_config(
    page_title="LLM security demo",
    page_icon="images/LEG.png",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Seed a placeholder entry in the session state the first time a session runs.
if "key" not in st.session_state:
    st.session_state["key"] = "value"

st.logo("images/ML6_logo.png")

# The leading emoji is the detective (U+1F575 + variation selector U+FE0F).
# It is written with escapes because the previous literal was mis-encoded and
# rendered as garbage characters in the page title.
st.title("\U0001F575\uFE0F LLM security demo")
st.info(
    "You are a secret agent meeting your informant in a bar. Convince him to give you his secret! But be prepared, with every new level the informant will be more cautious."
)
# Level numbers 0..6. Despite the name this is the list of level indices, not
# a count; the name is kept because the rest of the script refers to it.
num_levels = list(range(7))

# Secret word per level, in level order.
# NOTE(review): there are eight entries for seven levels and "Sesquipedalian"
# is listed twice. The zip() over the levels stops after seven items, so
# levels 2 and 6 share a secret and "Ineffable" is never used -- confirm that
# this is intended.
secrets = [
    "Perspicacity",
    "Obfuscation",
    "Sesquipedalian",
    "Synecdoche",
    "Defenestration",
    "Pulchritudinous",
    "Sesquipedalian",
    "Ineffable",
]

# Display name of each level; also used to build session-state and widget keys.
level_names = [f"Level {i}" for i in num_levels]

# One emoji per level, shown in the tab header and as the record table index.
# NOTE(review): these literals look mis-encoded (UTF-8 emoji decoded with a
# Greek code page); they are reproduced as found -- restore the intended emoji.
level_emojis = ["π΄", "π", "π", "π₯", "π₯΅", "π", "π"]
# CSS passed to every stylable hint container: a light grey rounded box, with
# code blocks allowed to wrap instead of scrolling horizontally.
hint_css = """
{
background-color: rgba(110, 110, 110, 0.1);
padding: 16px;
border-radius: 0.5rem;
}
code {
white-space: pre-wrap !important;
}
"""

# One tab per level, labelled with the level names.
level_tabs = st.tabs(level_names)
# Levels whose LLM response is fully buffered and screened before display.
_OUTPUT_SCREENED_LEVELS = (4, 5, 6)


def _response_leaks_secret(level_num, secret, output):
    """Return a truthy value when the level's output filter flags ``output``.

    Level 4 uses a case-insensitive substring test, level 5 delegates to
    ``llm.is_subsequence`` and level 6 asks the LLM judge. Any other level has
    no output filter and is never flagged.
    """
    if level_num == 4:
        return secret.lower() in output.lower()
    if level_num == 5:
        return llm.is_subsequence(output, secret)
    if level_num == 6:
        return llm.run_judge(level_num, {"secret": secret, "message": output})
    return False


def _mark_hint_opened(level_name, hint_idx):
    """Flag hint ``hint_idx`` of a level as used if the level is still unsolved.

    The flag is sticky: once set it stays set, and opening a hint after the
    level has been solved does not set it.
    """
    flag_key = f"opend_hint_{level_name}_{hint_idx}"
    st.session_state[flag_key] = bool(
        st.session_state[flag_key]
        or not st.session_state[f"solved_{level_name}"]
    )


def _show_prompt_parts(container, prompt_parts):
    """Render each entry of ``prompt_parts`` as a labelled code block."""
    for part_name, part_text in prompt_parts.items():
        label = part_name.replace("_", " ").capitalize()
        container.write(f"*{label}:*")
        container.code(part_text, language=None)


# Render one tab per level: prompt box and guess box on the left, the three
# hints on the right. zip() stops at the shortest input, i.e. seven levels.
for level_num, level_tab, name, emoji, secret in zip(
    num_levels, level_tabs, level_names, level_emojis, secrets
):
    # Per-level counters and flags, created the first time a session runs.
    initial_state = {
        f"solved_{name}": False,
        f"prompt_try_count_{name}": 0,
        f"secret_guess_count_{name}": 0,
        **{f"opend_hint_{name}_{i}": False for i in range(3)},
    }
    for state_key, initial_value in initial_state.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = initial_value

    # "Level 3" -> "level-3"; passed to the llm helpers to select the level.
    deployment_name = name.replace(" ", "-").lower()

    with level_tab:
        header_col1, header_col2 = st.columns(2, gap="medium")
        header_col1.subheader(f"{emoji} {name}")
        header_col2.subheader("Need help ...")

        col1, col2 = st.columns(2, gap="medium")

        # ---- Left column: prompt, response and secret guess ----
        with col1:
            with st.container(height=600, border=False):
                with st.container(border=True):
                    txt = st.text_area(
                        "Provide your prompt here:",
                        key=f"txt_{name}",
                        label_visibility="visible",
                        height=200,
                        placeholder="Your prompt",
                    )
                    btn_submit_prompt = st.button(
                        "Send prompt", key=f"submit_prompt_{name}"
                    )

                # True only on the rerun triggered by the button with a
                # non-empty prompt.
                prompt_submitted = bool(txt) and btn_submit_prompt

                if prompt_submitted:
                    st.session_state[f"prompt_try_count_{name}"] += 1
                    with st.container(border=True):
                        st.write("Response:")
                        if level_num == 3 and llm.run_judge(
                            level_num, {"user_input": txt}
                        ):
                            # Level 3 screens the *input* before calling the LLM.
                            st.error("Malicious user input detected. Try again.")
                        elif level_num in _OUTPUT_SCREENED_LEVELS:
                            # These levels screen the *output*, so the stream
                            # is buffered completely before anything is shown.
                            output = "".join(
                                llm.stream_request(deployment_name, secret, txt)
                            )
                            if _response_leaks_secret(level_num, secret, output):
                                st.error(
                                    "Secret detected in the response. Try again."
                                )
                            else:
                                st.write(output)
                        else:
                            st.write_stream(
                                llm.stream_request(deployment_name, secret, txt)
                            )

                with st.container(border=True):
                    secret_guess = st.text_input(
                        "What is the secret?",
                        key=f"guess_{name}",
                        placeholder="Your guess",
                    )
                    btn_submit_guess = st.button(
                        "Submit guess", key=f"submit_guess_{name}"
                    )
                    if btn_submit_guess:
                        st.session_state[f"secret_guess_count_{name}"] += 1
                        # Ignore case and surrounding whitespace in the guess.
                        if secret_guess.strip().lower() == secret.lower():
                            st.success("You found the secret!")
                            st.session_state[f"solved_{name}"] = True
                        else:
                            st.error("Wrong guess. Try again.")

        # ---- Right column: hints ----
        with col2:
            with st.container(border=True, height=600):
                st.info(
                    "There are three levels of hints available to you. But be careful, if you open a hint before solving the secret, it will show up in your record.",
                    # Information emoji (U+2139 + U+FE0F), written with escapes
                    # because the previous literal was mis-encoded.
                    icon="\u2139\ufe0f",
                )

                # Hint 1: textual description of the level's defence.
                hint_1_cont = stylable_container("hint_1_container", hint_css)
                hint1 = hint_1_cont.checkbox(
                    "Hint 1 - **Description of security strategy**",
                    key=f"hint1_checkbox_{name}",
                )
                if hint1:
                    _mark_hint_opened(name, 0)
                    hint_1_cont.write(config.LEVEL_DESCRIPTIONS[level_num]["info"])

                # Hint 2: the prompt(s) and checks that are actually executed.
                hint_2_cont = stylable_container("hint_2_container", hint_css)
                hint2 = hint_2_cont.checkbox(
                    "Hint 2 - **Code execution**", key=f"hint2_checkbox_{name}"
                )
                if hint2:
                    _mark_hint_opened(name, 1)

                    # The user's text is only passed along when a prompt was
                    # submitted on this rerun.
                    prompts = llm.get_full_prompt(
                        deployment_name,
                        user_input=txt if prompt_submitted else None,
                    )

                    if level_num == 3:
                        special_prompt = llm.get_full_prompt(
                            llm.special_checks[3], user_input=txt
                        )
                        hint_2_cont.write(
                            "Step 1: A **LLM judge** reviews the user input and determines if it is malicious or not."
                        )
                        hint_2_cont.write("**LLM judge prompt:**")
                        for judge_part in special_prompt.values():
                            hint_2_cont.code(judge_part, language=None)
                        hint_2_cont.write(
                            "Step 2: If the user input is not classified as malicious, the prompt containing the actual secret is executed and the response is shown."
                        )
                        hint_2_cont.write("**Actual prompt:**")
                        _show_prompt_parts(hint_2_cont, prompts)
                    elif level_num == 4:
                        hint_2_cont.write(
                            "Step 1: The following prompt is executed:"
                        )
                        _show_prompt_parts(hint_2_cont, prompts)
                        hint_2_cont.write(
                            "Step 2: In the response text of the LLM call, the secret is searched for with a simple python expression `secret.lower() in output.lower()`. If it contains the secret, the output will not be shown."
                        )
                    elif level_num == 5:
                        hint_2_cont.write(
                            "Step 1: The following prompt is executed:"
                        )
                        _show_prompt_parts(hint_2_cont, prompts)
                        hint_2_cont.write(
                            "Step 2: In the response text of the LLM call, the secret is searched for with a python function 'is_subsequence' which looks for substrings. If it contains the secret, the output will not be shown."
                        )
                    elif level_num == 6:
                        # Level 6 responses are screened by the LLM judge, so
                        # the generic "response is shown" text would mislead.
                        hint_2_cont.write(
                            "Step 1: The following prompt is executed:"
                        )
                        _show_prompt_parts(hint_2_cont, prompts)
                        hint_2_cont.write(
                            "Step 2: A **LLM judge** reviews the response of the LLM call and determines if it reveals the secret. If it does, the output will not be shown."
                        )
                    else:
                        hint_2_cont.write(
                            "Step 1: The following prompt is executed and the response is shown:"
                        )
                        _show_prompt_parts(hint_2_cont, prompts)

                # Hint 3: an example prompt that can extract the secret.
                hint_3_cont = stylable_container("hint_3_container", hint_css)
                hint3 = hint_3_cont.checkbox(
                    "Hint 3 - **Example solution**",
                    key=f"hint3_checkbox_{name}",
                )
                if hint3:
                    _mark_hint_opened(name, 2)
                    hint_3_cont.code(
                        config.LEVEL_DESCRIPTIONS[level_num]["solution"],
                        language=None,
                    )
                    hint_3_cont.info("*May not always work")
# Scoreboard: one row per level with the try/guess counters, which hints were
# opened before solving, and -- once solved -- the secret itself.
# NOTE(review): the expander emoji and the hint marks below look mis-encoded;
# they are reproduced as found because the intended glyphs cannot be recovered.
with st.expander("π Record", expanded=True):
    table_data = [
        [
            idx,
            st.session_state[f"prompt_try_count_{level_name}"],
            st.session_state[f"secret_guess_count_{level_name}"],
            "β" if st.session_state[f"opend_hint_{level_name}_0"] else "-",
            "β" if st.session_state[f"opend_hint_{level_name}_1"] else "-",
            "β" if st.session_state[f"opend_hint_{level_name}_2"] else "-",
            # Check mark / cross mark, written with escapes: the original
            # line had lost its opening literal and did not parse.
            "\u2705" if st.session_state[f"solved_{level_name}"] else "\u274c",
            # The secret is only revealed for solved levels.
            secrets[idx] if st.session_state[f"solved_{level_name}"] else "...",
        ]
        for idx, level_name in enumerate(level_names)
    ]
    st.table(
        pd.DataFrame(
            table_data,
            columns=[
                "Level",
                "Prompt tries",
                "Secret guesses",
                "Used hint 1",
                "Used hint 2",
                "Used hint 3",
                "Solved",
                "Secret",
            ],
            index=level_emojis,
        )
    )