import logging
import time
from typing import Any, Dict, List, Tuple
import streamlit as st
from llm_guard.input_scanners import get_scanner_by_name
from llm_guard.input_scanners.anonymize import default_entity_types
from llm_guard.input_scanners.code import SUPPORTED_LANGUAGES as SUPPORTED_CODE_LANGUAGES
from llm_guard.input_scanners.gibberish import MatchType as GibberishMatchType
from llm_guard.input_scanners.language import MatchType as LanguageMatchType
from llm_guard.input_scanners.prompt_injection import MatchType as PromptInjectionMatchType
from llm_guard.input_scanners.toxicity import MatchType as ToxicityMatchType
from llm_guard.vault import Vault
from streamlit_tags import st_tags

logger = logging.getLogger("llm-guard-playground")


def init_settings() -> Tuple[List[str], Dict]:
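    """Render sidebar controls for every scanner and collect the chosen settings.

    Returns the list of enabled scanner names and a dict of per-scanner settings.
    """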
all_scanners = [
"Anonymize",
"BanCode",
"BanCompetitors",
"BanSubstrings",
"BanTopics",
"Code",
"Gibberish",
"Language",
"PromptInjection",
"Regex",
"Secrets",
"Sentiment",
"TokenLimit",
"Toxicity",
]
st_enabled_scanners = st.sidebar.multiselect(
"Select scanners",
options=all_scanners,
default=all_scanners,
help="The list can be found here: https://llm-guard.com/input_scanners/anonymize/",
)
settings = {}
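    # One sidebar expander per enabled scanner; each block collects the kwargs
    # later passed to get_scanner_by_name.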
if "Anonymize" in st_enabled_scanners:
st_anon_expander = st.sidebar.expander(
"Anonymize",
expanded=False,
)
with st_anon_expander:
st_anon_entity_types = st_tags(
label="Anonymize entities",
text="Type and press enter",
value=default_entity_types,
suggestions=default_entity_types
+ ["DATE_TIME", "NRP", "LOCATION", "MEDICAL_LICENSE", "US_PASSPORT"],
maxtags=30,
key="anon_entity_types",
)
st.caption(
"Check all supported entities: https://llm-guard.com/input_scanners/anonymize/"
)
st_anon_hidden_names = st_tags(
label="Hidden names to be anonymized",
text="Type and press enter",
value=[],
suggestions=[],
maxtags=30,
key="anon_hidden_names",
)
st.caption("These names will be hidden e.g. [REDACTED_CUSTOM1].")
st_anon_allowed_names = st_tags(
label="Allowed names to ignore",
text="Type and press enter",
value=[],
suggestions=[],
maxtags=30,
key="anon_allowed_names",
)
st.caption("These names will be ignored even if flagged by the detector.")
st_anon_preamble = st.text_input(
"Preamble", value="Text to prepend to sanitized prompt: "
)
st_anon_use_faker = st.checkbox(
"Use Faker",
value=False,
help="Use Faker library to generate fake data",
key="anon_use_faker",
)
st_anon_threshold = st.slider(
label="Threshold",
value=0.0,
min_value=0.0,
max_value=1.0,
step=0.1,
key="anon_threshold",
)
settings["Anonymize"] = {
"entity_types": st_anon_entity_types,
"hidden_names": st_anon_hidden_names,
"allowed_names": st_anon_allowed_names,
"preamble": st_anon_preamble,
"use_faker": st_anon_use_faker,
"threshold": st_anon_threshold,
}
if "BanCode" in st_enabled_scanners:
        st_bcode_expander = st.sidebar.expander(
            "Ban Code",
            expanded=False,
        )
        with st_bcode_expander:
            st_bcode_threshold = st.slider(
label="Threshold",
value=0.95,
min_value=0.0,
max_value=1.0,
step=0.05,
key="ban_code_threshold",
)
settings["BanCode"] = {
"threshold": st_bc_threshold,
}
if "BanCompetitors" in st_enabled_scanners:
st_bc_expander = st.sidebar.expander(
"Ban Competitors",
expanded=False,
)
with st_bc_expander:
st_bc_competitors = st_tags(
label="List of competitors",
text="Type and press enter",
value=["openai", "anthropic", "deepmind", "google"],
suggestions=[],
maxtags=30,
key="bc_competitors",
)
st_bc_threshold = st.slider(
label="Threshold",
value=0.5,
min_value=0.0,
max_value=1.0,
step=0.05,
key="ban_competitors_threshold",
)
settings["BanCompetitors"] = {
"competitors": st_bc_competitors,
"threshold": st_bc_threshold,
}
if "BanSubstrings" in st_enabled_scanners:
st_bs_expander = st.sidebar.expander(
"Ban Substrings",
expanded=False,
)
with st_bs_expander:
st_bs_substrings = st.text_area(
"Enter substrings to ban (one per line)",
value="test\nhello\nworld",
height=200,
).split("\n")
st_bs_match_type = st.selectbox(
"Match type", ["str", "word"], index=0, key="bs_match_type"
)
st_bs_case_sensitive = st.checkbox(
"Case sensitive", value=False, key="bs_case_sensitive"
)
st_bs_redact = st.checkbox("Redact", value=False, key="bs_redact")
st_bs_contains_all = st.checkbox("Contains all", value=False, key="bs_contains_all")
settings["BanSubstrings"] = {
"substrings": st_bs_substrings,
"match_type": st_bs_match_type,
"case_sensitive": st_bs_case_sensitive,
"redact": st_bs_redact,
"contains_all": st_bs_contains_all,
}
if "BanTopics" in st_enabled_scanners:
st_bt_expander = st.sidebar.expander(
"Ban Topics",
expanded=False,
)
with st_bt_expander:
st_bt_topics = st_tags(
label="List of topics",
text="Type and press enter",
value=["violence"],
suggestions=[],
maxtags=30,
key="bt_topics",
)
st_bt_threshold = st.slider(
label="Threshold",
value=0.6,
min_value=0.0,
max_value=1.0,
step=0.05,
key="ban_topics_threshold",
)
settings["BanTopics"] = {
"topics": st_bt_topics,
"threshold": st_bt_threshold,
}
if "Code" in st_enabled_scanners:
st_cd_expander = st.sidebar.expander(
"Code",
expanded=False,
)
with st_cd_expander:
st_cd_languages = st.multiselect(
"Programming languages",
SUPPORTED_CODE_LANGUAGES,
default=["Python"],
)
st_cd_is_blocked = st.checkbox("Is blocked", value=False, key="code_is_blocked")
settings["Code"] = {
"languages": st_cd_languages,
"is_blocked": st_cd_is_blocked,
}
if "Gibberish" in st_enabled_scanners:
st_gib_expander = st.sidebar.expander(
"Gibberish",
expanded=False,
)
with st_gib_expander:
st_gib_threshold = st.slider(
label="Threshold",
value=0.7,
min_value=0.0,
max_value=1.0,
step=0.1,
key="gibberish_threshold",
)
st_gib_match_type = st.selectbox(
"Match type",
[e.value for e in GibberishMatchType],
index=1,
key="gibberish_match_type",
)
settings["Gibberish"] = {
"threshold": st_gib_threshold,
"match_type": st_gib_match_type,
}
if "Language" in st_enabled_scanners:
st_lan_expander = st.sidebar.expander(
"Language",
expanded=False,
)
with st_lan_expander:
st_lan_valid_language = st.multiselect(
"Languages",
[
"ar",
"bg",
"de",
"el",
"en",
"es",
"fr",
"hi",
"it",
"ja",
"nl",
"pl",
"pt",
"ru",
"sw",
"th",
"tr",
"ur",
"vi",
"zh",
],
default=["en"],
)
st_lan_match_type = st.selectbox(
"Match type",
[e.value for e in LanguageMatchType],
index=1,
key="language_match_type",
)
settings["Language"] = {
"valid_languages": st_lan_valid_language,
"match_type": st_lan_match_type,
}
if "PromptInjection" in st_enabled_scanners:
st_pi_expander = st.sidebar.expander(
"Prompt Injection",
expanded=False,
)
with st_pi_expander:
st_pi_threshold = st.slider(
label="Threshold",
value=0.75,
min_value=0.0,
max_value=1.0,
step=0.05,
key="prompt_injection_threshold",
)
st_pi_match_type = st.selectbox(
"Match type",
[e.value for e in PromptInjectionMatchType],
index=1,
key="prompt_injection_match_type",
)
settings["PromptInjection"] = {
"threshold": st_pi_threshold,
"match_type": st_pi_match_type,
}
if "Regex" in st_enabled_scanners:
st_regex_expander = st.sidebar.expander(
"Regex",
expanded=False,
)
with st_regex_expander:
st_regex_patterns = st.text_area(
"Enter patterns to ban (one per line)",
value="Bearer [A-Za-z0-9-._~+/]+",
height=200,
).split("\n")
st_regex_is_blocked = st.checkbox("Is blocked", value=True, key="regex_is_blocked")
st_regex_redact = st.checkbox(
"Redact",
value=False,
help="Replace the matched bad patterns with [REDACTED]",
key="regex_redact",
)
settings["Regex"] = {
"patterns": st_regex_patterns,
"is_blocked": st_regex_is_blocked,
"redact": st_regex_redact,
}
if "Secrets" in st_enabled_scanners:
st_sec_expander = st.sidebar.expander(
"Secrets",
expanded=False,
)
with st_sec_expander:
st_sec_redact_mode = st.selectbox("Redact mode", ["all", "partial", "hash"])
settings["Secrets"] = {
"redact_mode": st_sec_redact_mode,
}
if "Sentiment" in st_enabled_scanners:
st_sent_expander = st.sidebar.expander(
"Sentiment",
expanded=False,
)
with st_sent_expander:
st_sent_threshold = st.slider(
label="Threshold",
value=-0.5,
min_value=-1.0,
max_value=1.0,
step=0.1,
key="sentiment_threshold",
help="Negative values are negative sentiment, positive values are positive sentiment",
)
settings["Sentiment"] = {
"threshold": st_sent_threshold,
}
if "TokenLimit" in st_enabled_scanners:
st_tl_expander = st.sidebar.expander(
"Token Limit",
expanded=False,
)
with st_tl_expander:
st_tl_limit = st.number_input(
"Limit", value=4096, min_value=0, max_value=10000, step=10
)
st_tl_encoding_name = st.selectbox(
"Encoding name",
["cl100k_base", "p50k_base", "r50k_base"],
index=0,
help="Read more: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb",
)
settings["TokenLimit"] = {
"limit": st_tl_limit,
"encoding_name": st_tl_encoding_name,
}
if "Toxicity" in st_enabled_scanners:
st_tox_expander = st.sidebar.expander(
"Toxicity",
expanded=False,
)
with st_tox_expander:
st_tox_threshold = st.slider(
label="Threshold",
value=0.75,
min_value=0.0,
max_value=1.0,
step=0.05,
key="toxicity_threshold",
)
st_tox_match_type = st.selectbox(
"Match type",
[e.value for e in ToxicityMatchType],
index=1,
key="toxicity_match_type",
)
settings["Toxicity"] = {
"threshold": st_tox_threshold,
"match_type": st_tox_match_type,
}
return st_enabled_scanners, settings


def get_scanner(scanner_name: str, vault: Vault, settings: Dict):
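    """Build a single scanner instance by name, injecting shared resources into its settings."""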
logger.debug(f"Initializing {scanner_name} scanner")
if scanner_name == "Anonymize":
settings["vault"] = vault
if scanner_name in [
"Anonymize",
"BanCode",
"BanTopics",
"Code",
"Gibberish",
"PromptInjection",
"Toxicity",
]:
settings["use_onnx"] = True
return get_scanner_by_name(scanner_name, settings)


def scan(
    vault: Vault, enabled_scanners: List[str], settings: Dict, text: str, fail_fast: bool = False
) -> Tuple[str, List[Dict[str, Any]]]:
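    """Run the enabled scanners over the prompt in order.

    Returns the sanitized prompt plus a per-scanner result list with validity,
    risk score, and elapsed time in seconds.
    """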
sanitized_prompt = text
results = []
status_text = "Scanning prompt..."
if fail_fast:
status_text = "Scanning prompt (fail fast mode)..."
with st.status(status_text, expanded=True) as status:
for scanner_name in enabled_scanners:
st.write(f"{scanner_name} scanner...")
scanner = get_scanner(scanner_name, vault, settings[scanner_name])
start_time = time.monotonic()
sanitized_prompt, is_valid, risk_score = scanner.scan(sanitized_prompt)
end_time = time.monotonic()
results.append(
{
"scanner": scanner_name,
"is_valid": is_valid,
"risk_score": risk_score,
"took_sec": round(timedelta(seconds=end_time - start_time).total_seconds(), 2),
}
)
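            # In fail-fast mode, stop at the first scanner that flags the prompt.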
if fail_fast and not is_valid:
break
status.update(label="Scanning complete", state="complete", expanded=False)
return sanitized_prompt, results
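

# Minimal usage sketch (illustrative only; the variable names and page wiring
# below are assumptions, not part of this module):
#
#     vault = Vault()
#     enabled_scanners, settings = init_settings()
#     prompt = st.text_area("Prompt")
#     if st.button("Scan") and prompt:
#         sanitized_prompt, results = scan(vault, enabled_scanners, settings, prompt)
#         st.json(results)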