"""Demo UI to show different levels of LLM security."""

import pandas as pd
import streamlit as st
from streamlit_extras.stylable_container import stylable_container

import config
import llm

# Page-level configuration is issued before any other Streamlit element is
# rendered.
st.set_page_config(
    page_title="LLM security demo",
    page_icon="images/LEG.png",
    layout="wide",
    initial_sidebar_state="expanded",
)

# NOTE(review): placeholder session entry; nothing in this script reads it.
# Kept in case another module relies on it -- confirm and remove if unused.
if "key" not in st.session_state:
    st.session_state["key"] = "value"

st.logo("images/ML6_logo.png")

# NOTE(review): the emoji was reconstructed from a mis-encoded source
# (detective glyph, matching the "secret agent" theme) -- confirm.
st.title("🕵️ LLM security demo")
st.info(
    "You are a secret agent meeting your informant in a bar. Convince him to "
    "give you his secret! But be prepared, with every new level the informant "
    "will be more cautious."
)

# Level indices; the per-level lists below are zipped together in this order.
num_levels = list(range(7))

# One secret word per level. Every level needs its own word: a repeated entry
# would let a player reuse an earlier answer on a later, harder level.
secrets = [
    "Perspicacity",
    "Obfuscation",
    "Sesquipedalian",
    "Synecdoche",
    "Defenestration",
    "Pulchritudinous",
    "Ineffable",
]

# Display names; also used as the suffix of every per-level widget and
# session-state key.
level_names = [f"Level {i}" for i in num_levels]

# NOTE(review): these glyphs are mis-encoded in the source (UTF-8 emoji read
# as a single-byte Greek code page, with bytes lost), so the intended emoji
# cannot be recovered reliably here. Restore them from the original file.
level_emojis = ["π΄", "π", "π", "π₯", "π₯΅", "π", "π"]

# One tab per level, in the same order as level_names.
level_tabs = st.tabs(level_names)

# CSS passed to stylable_container for every hint box: a lightly shaded,
# rounded panel, plus wrapping of long lines inside code blocks so prompts
# stay readable without horizontal scrolling.
hint_css = """
{
    background-color: rgba(110, 110, 110, 0.1);
    padding: 16px;
    border-radius: 0.5rem;
}
code {
    white-space: pre-wrap !important;
}
"""

# Levels whose LLM response is collected in full and screened for the secret
# before anything is shown to the player.
_OUTPUT_FILTERED_LEVELS = (4, 5, 6)


def _init_level_state(name):
    """Seed the counters and flags of level *name* in session state if missing."""
    defaults = {
        f"solved_{name}": False,
        f"prompt_try_count_{name}": 0,
        f"secret_guess_count_{name}": 0,
        # "opend" is misspelled, but the record table reads these exact keys.
        **{f"opend_hint_{name}_{i}": False for i in range(3)},
    }
    for state_key, initial in defaults.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = initial


def _mark_hint_opened(name, hint_idx):
    """Flag hint *hint_idx* of level *name* as used.

    The flag is only raised when the hint is open while the level is still
    unsolved; once raised it stays raised.
    """
    flag = f"opend_hint_{name}_{hint_idx}"
    st.session_state[flag] = (
        st.session_state[flag] or not st.session_state[f"solved_{name}"]
    )


def _response_leaks_secret(level_num, secret, output):
    """Return a truthy value if the output filter of *level_num* finds *secret*.

    Only meaningful for the levels listed in ``_OUTPUT_FILTERED_LEVELS``.
    """
    if level_num == 4:
        return secret.lower() in output.lower()
    if level_num == 5:
        return llm.is_subsequence(output, secret)
    return llm.run_judge(level_num, {"secret": secret, "message": output})


def _show_prompt_parts(container, prompts):
    """Render every named prompt part in *container* as caption + code block."""
    for part_name, part_text in prompts.items():
        caption = part_name.replace("_", " ").capitalize()
        container.write(f"*{caption}:*")
        container.code(part_text, language=None)


for level_num, level_tab, name, emoji, secret in zip(
    num_levels, level_tabs, level_names, level_emojis, secrets
):
    _init_level_state(name)

    deployment_name = name.replace(" ", "-").lower()
    with level_tab:
        header_col1, header_col2 = st.columns(2, gap="medium")
        header_col1.subheader(f"{emoji} {name}")
        header_col2.subheader("Need help ...")

        col1, col2 = st.columns(2, gap="medium")

        # Left column: prompt box (with the response) and the guess box.
        with col1:
            with st.container(height=600, border=False):
                with st.container(border=True):
                    txt = st.text_area(
                        "Provide your prompt here:",
                        key=f"txt_{name}",
                        label_visibility="visible",
                        height=200,
                        placeholder="Your prompt",
                    )
                    btn_submit_prompt = st.button(
                        "Send prompt", key=f"submit_prompt_{name}"
                    )
                    # True only in the run where a non-empty prompt was sent.
                    prompt_submitted = bool(txt) and btn_submit_prompt

                    if prompt_submitted:
                        st.session_state[f"prompt_try_count_{name}"] += 1
                        with st.container(border=True):
                            st.write("Response:")

                            if level_num == 3 and llm.run_judge(
                                level_num, {"user_input": txt}
                            ):
                                # Level 3 screens the *input* before the
                                # secret-bearing prompt is ever executed.
                                st.error("Malicious user input detected. Try again.")
                            elif level_num in _OUTPUT_FILTERED_LEVELS:
                                # The full response is needed for screening,
                                # so it cannot be streamed to the page.
                                output = "".join(
                                    llm.stream_request(deployment_name, secret, txt)
                                )
                                if _response_leaks_secret(level_num, secret, output):
                                    # NOTE(review): the check runs on the LLM
                                    # response, not on the user input; the
                                    # wording is kept as-is -- confirm intent.
                                    st.error(
                                        "Secret detected in user input. Try again."
                                    )
                                else:
                                    st.write(output)
                            else:
                                st.write_stream(
                                    llm.stream_request(deployment_name, secret, txt)
                                )

                with st.container(border=True):
                    secret_guess = st.text_input(
                        "What is the secret?",
                        key=f"guess_{name}",
                        placeholder="Your guess",
                    )
                    btn_submit_guess = st.button(
                        "Submit guess", key=f"submit_guess_{name}"
                    )

                    if btn_submit_guess:
                        st.session_state[f"secret_guess_count_{name}"] += 1
                        # Surrounding whitespace must not turn a correct
                        # guess into a wrong one.
                        if secret_guess.strip().lower() == secret.lower():
                            st.success("You found the secret!")
                            st.session_state[f"solved_{name}"] = True
                        else:
                            st.error("Wrong guess. Try again.")

        # Right column: the three hints.
        with col2:
            with st.container(border=True, height=600):
                # NOTE(review): the icon was reconstructed from a mis-encoded
                # source (information glyph) -- confirm.
                st.info(
                    "There are three levels of hints available to you. But be "
                    "careful, if you open a hint before solving the secret, it "
                    "will show up in your record.",
                    icon="ℹ️",
                )

                # Container keys carry the level number: all tabs are rendered
                # in the same script run, so a shared key would be reused once
                # per level.
                hint_1_cont = stylable_container(
                    f"hint_1_container_{level_num}", hint_css
                )
                hint1 = hint_1_cont.checkbox(
                    "Hint 1 - **Description of security strategy**",
                    key=f"hint1_checkbox_{name}",
                )
                if hint1:
                    _mark_hint_opened(name, 0)
                    hint_1_cont.write(config.LEVEL_DESCRIPTIONS[level_num]["info"])

                hint_2_cont = stylable_container(
                    f"hint_2_container_{level_num}", hint_css
                )
                hint2 = hint_2_cont.checkbox(
                    "Hint 2 - **Code execution**", key=f"hint2_checkbox_{name}"
                )
                if hint2:
                    _mark_hint_opened(name, 1)

                    # The user's text is passed on only in the run where it
                    # was actually submitted; otherwise None is passed.
                    user_input_holder = txt if prompt_submitted else None
                    prompts = llm.get_full_prompt(
                        deployment_name, user_input=user_input_holder
                    )

                    if level_num == 3:
                        special_prompt = llm.get_full_prompt(
                            llm.special_checks[3], user_input=txt
                        )

                        hint_2_cont.write(
                            "Step 1: A **LLM judge** reviews the user input and "
                            "determines if it is malicious or not."
                        )
                        hint_2_cont.write("**LLM judge prompt:**")
                        for judge_part in special_prompt.values():
                            hint_2_cont.code(judge_part, language=None)
                        hint_2_cont.write(
                            "Step 2: If the user input is not classified as "
                            "malicious, the prompt containing the actual secret "
                            "is executed and the response is shown."
                        )
                        hint_2_cont.write("**Actual prompt:**")
                        _show_prompt_parts(hint_2_cont, prompts)
                    elif level_num == 4:
                        hint_2_cont.write("Step 1: The following prompt is executed:")
                        _show_prompt_parts(hint_2_cont, prompts)
                        hint_2_cont.write(
                            "Step 2: In the response text of the LLM call, the "
                            "secret is searched for with a simple python "
                            "expression `secret.lower() in output.lower()`. If "
                            "it contains the secret, the output will not be "
                            "shown."
                        )
                    elif level_num == 5:
                        hint_2_cont.write("Step 1: The following prompt is executed:")
                        _show_prompt_parts(hint_2_cont, prompts)
                        hint_2_cont.write(
                            "Step 2: In the response text of the LLM call, the "
                            "secret is searched for with a python function "
                            "'is_subsequence' which looks for substrings. If it "
                            "contains the secret, the output will not be shown."
                        )
                    else:
                        # NOTE(review): level 6 also lands here, although its
                        # response is screened by an LLM judge before being
                        # shown; a dedicated explanation may be missing.
                        hint_2_cont.write(
                            "Step 1: The following prompt is executed and the "
                            "response is shown:"
                        )
                        _show_prompt_parts(hint_2_cont, prompts)

                hint_3_cont = stylable_container(
                    f"hint_3_container_{level_num}", hint_css
                )
                hint3 = hint_3_cont.checkbox(
                    "Hint 3 - **Example solution**",
                    key=f"hint3_checkbox_{name}",
                )
                if hint3:
                    _mark_hint_opened(name, 2)
                    hint_3_cont.code(
                        config.LEVEL_DESCRIPTIONS[level_num]["solution"],
                        language=None,
                    )
                    hint_3_cont.info("*May not always work")

# Score sheet: one row per level with attempt counts, hint usage and, once a
# level is solved, its secret.
# NOTE(review): the expander label's emoji and the hint-usage marker below are
# mis-encoded in the source (bytes lost) and are kept as found; restore them
# from the original file.
with st.expander("π Record", expanded=True):
    table_data = [
        [
            idx,
            st.session_state[f"prompt_try_count_{level_name}"],
            st.session_state[f"secret_guess_count_{level_name}"],
            "β" if st.session_state[f"opend_hint_{level_name}_0"] else "-",
            "β" if st.session_state[f"opend_hint_{level_name}_1"] else "-",
            "β" if st.session_state[f"opend_hint_{level_name}_2"] else "-",
            # NOTE(review): check/cross glyphs reconstructed from a
            # mis-encoded literal that was split across two lines -- confirm.
            "✅" if st.session_state[f"solved_{level_name}"] else "❌",
            # The secret is only revealed for solved levels.
            secrets[idx] if st.session_state[f"solved_{level_name}"] else "...",
        ]
        for idx, level_name in enumerate(level_names)
    ]

    st.table(
        pd.DataFrame(
            table_data,
            columns=[
                "Level",
                "Prompt tries",
                "Secret guesses",
                "Used hint 1",
                "Used hint 2",
                "Used hint 3",
                "Solved",
                "Secret",
            ],
            index=level_emojis,
        )
    )