Spaces:

wandb
/

guardrails-genie

Running

App Files Files Community

geekyrakshit committed on Nov 25, 2024

Commit

af688eb

1 Parent(s): 96b1c8c

add: summarization to guardrails

Browse files

Files changed (5) hide show

app.py +3 -1
application_pages/chat_app.py +39 -38
guardrails_genie/guardrails/injection/protectai_guardrail.py +6 -1
guardrails_genie/guardrails/injection/survey_guardrail.py +11 -2
guardrails_genie/guardrails/manager.py +5 -2

app.py CHANGED Viewed

@@ -4,7 +4,9 @@ intro_page = st.Page(
     "application_pages/intro_page.py", title="Introduction", icon=":material/guardian:"
 )
 chat_page = st.Page(
-    "application_pages/chat_app.py", title="Chat", icon=":material/robot:"
 )
 evaluation_page = st.Page(
     "application_pages/evaluation_app.py",

     "application_pages/intro_page.py", title="Introduction", icon=":material/guardian:"
 )
 chat_page = st.Page(
+    "application_pages/chat_app.py",
+    title="Playground",
+    icon=":material/sports_esports:",
 )
 evaluation_page = st.Page(
     "application_pages/evaluation_app.py",

application_pages/chat_app.py CHANGED Viewed

@@ -7,19 +7,27 @@ from dotenv import load_dotenv
 from guardrails_genie.guardrails import GuardrailManager
 from guardrails_genie.llm import OpenAIModel
 load_dotenv()
 weave.init(project_name="guardrails-genie")
-st.title(":material/robot: Guardrails Genie")
 if "guardrails" not in st.session_state:
     st.session_state.guardrails = []
 if "guardrail_names" not in st.session_state:
     st.session_state.guardrail_names = []
 if "guardrails_manager" not in st.session_state:
     st.session_state.guardrails_manager = None
-if "chat_started" not in st.session_state:
-    st.session_state.chat_started = False
 def initialize_guardrails():
@@ -67,48 +75,41 @@ guardrail_names = st.sidebar.multiselect(
 )
 st.session_state.guardrail_names = guardrail_names
-if st.sidebar.button("Start Chat") and chat_condition:
-    st.session_state.chat_started = True
-if st.session_state.chat_started:
     with st.sidebar.status("Initializing Guardrails..."):
         initialize_guardrails()
-    # Initialize chat history
-    if "messages" not in st.session_state:
-        st.session_state.messages = []
-    llm_model = OpenAIModel(model_name=openai_model)
-    # Display chat messages from history on app rerun
-    for message in st.session_state.messages:
-        with st.chat_message(message["role"]):
-            st.markdown(message["content"])
-    # React to user input
-    if prompt := st.chat_input("What is up?"):
-        # Display user message in chat message container
-        st.chat_message("user").markdown(prompt)
-        # Add user message to chat history
-        st.session_state.messages.append({"role": "user", "content": prompt})
-        guardrails_response, call = st.session_state.guardrails_manager.guard.call(
-            st.session_state.guardrails_manager, prompt=prompt
-        )
         if guardrails_response["safe"]:
-            response, call = llm_model.predict.call(
-                llm_model, user_prompts=prompt, messages=st.session_state.messages
             )
-            response = response.choices[0].message.content
-            # Display assistant response in chat message container
-            with st.chat_message("assistant"):
-                st.markdown(response + f"\n\n---\n[Explore in Weave]({call.ui_url})")
-            # Add assistant response to chat history
-            st.session_state.messages.append({"role": "assistant", "content": response})
         else:
-            st.error("Guardrails detected an issue with the prompt.")
-            for alert in guardrails_response["alerts"]:
-                st.error(f"{alert['guardrail_name']}: {alert['response']}")
-            st.error(f"For details, explore in Weave at {call.ui_url}")

 from guardrails_genie.guardrails import GuardrailManager
 from guardrails_genie.llm import OpenAIModel
+st.title(":material/robot: Guardrails Genie Playground")
 load_dotenv()
 weave.init(project_name="guardrails-genie")
 if "guardrails" not in st.session_state:
     st.session_state.guardrails = []
 if "guardrail_names" not in st.session_state:
     st.session_state.guardrail_names = []
 if "guardrails_manager" not in st.session_state:
     st.session_state.guardrails_manager = None
+if "initialize_guardrails" not in st.session_state:
+    st.session_state.initialize_guardrails = False
+if "system_prompt" not in st.session_state:
+    st.session_state.system_prompt = ""
+if "user_prompt" not in st.session_state:
+    st.session_state.user_prompt = ""
+if "test_guardrails" not in st.session_state:
+    st.session_state.test_guardrails = False
+if "llm_model" not in st.session_state:
+    st.session_state.llm_model = None
 def initialize_guardrails():
 )
 st.session_state.guardrail_names = guardrail_names
+if st.sidebar.button("Initialize Guardrails") and chat_condition:
+    st.session_state.initialize_guardrails = True
+if st.session_state.initialize_guardrails:
     with st.sidebar.status("Initializing Guardrails..."):
         initialize_guardrails()
+        st.session_state.llm_model = OpenAIModel(model_name=openai_model)
+    user_prompt = st.text_area("User Prompt", value="")
+    st.session_state.user_prompt = user_prompt
+    test_guardrails_button = st.button("Test Guardrails")
+    st.session_state.test_guardrails = test_guardrails_button
+    if st.session_state.test_guardrails:
+        with st.sidebar.status("Running Guardrails..."):
+            guardrails_response, call = st.session_state.guardrails_manager.guard.call(
+                st.session_state.guardrails_manager, prompt=st.session_state.user_prompt
+            )
         if guardrails_response["safe"]:
+            st.markdown(
+                f"\n\n---\nPrompt is safe! Explore prompt trace on [Weave]({call.ui_url})\n\n---\n"
             )
+            with st.sidebar.status("Generating response from LLM..."):
+                response, call = st.session_state.llm_model.predict.call(
+                    st.session_state.llm_model,
+                    user_prompts=st.session_state.user_prompt,
+                )
+            st.markdown(
+                response.choices[0].message.content
+                + f"\n\n---\nExplore LLM generation trace on [Weave]({call.ui_url})"
+            )
         else:
+            st.warning("Prompt is not safe!")
+            st.markdown(guardrails_response["summary"])
+            st.markdown(f"Explore prompt trace on [Weave]({call.ui_url})")

guardrails_genie/guardrails/injection/protectai_guardrail.py CHANGED Viewed

@@ -35,4 +35,9 @@ class PromptInjectionProtectAIGuardrail(Guardrail):
     @weave.op()
     def guard(self, prompt: str):
-        return self.predict(prompt)

     @weave.op()
     def guard(self, prompt: str):
+        response = self.classify(prompt)
+        confidence_percentage = round(response[0]["score"] * 100, 2)
+        return {
+            "safe": response[0]["label"] != "INJECTION",
+            "summary": f"Prompt is deemed {response[0]['label']} with {confidence_percentage}% confidence.",
+        }

guardrails_genie/guardrails/injection/survey_guardrail.py CHANGED Viewed

@@ -70,8 +70,17 @@ Here are some strict instructions that you must follow:
             **kwargs,
         )
         response = chat_completion.choices[0].message.parsed
-        return {"safe": not response.injection_prompt}
     @weave.op()
     def guard(self, prompt: str, **kwargs) -> list[str]:
-        return self.predict(prompt, **kwargs)

             **kwargs,
         )
         response = chat_completion.choices[0].message.parsed
+        return response
     @weave.op()
     def guard(self, prompt: str, **kwargs) -> list[str]:
+        response = self.predict(prompt, **kwargs)
+        summary = (
+            f"Prompt is deemed safe. {response.explanation}"
+            if not response.injection_prompt
+            else f"Prompt is deemed a {'direct attack' if response.is_direct_attack else 'indirect attack'} of type {response.attack_type}. {response.explanation}"
+        )
+        return {
+            "safe": not response.injection_prompt,
+            "summary": summary,
+        }

guardrails_genie/guardrails/manager.py CHANGED Viewed

@@ -9,7 +9,7 @@ class GuardrailManager(weave.Model):
     @weave.op()
     def guard(self, prompt: str, progress_bar: bool = True, **kwargs) -> dict:
-        alerts, safe = [], True
         iterable = (
             track(self.guardrails, description="Running guardrails")
             if progress_bar
@@ -21,7 +21,10 @@ class GuardrailManager(weave.Model):
                 {"guardrail_name": guardrail.__class__.__name__, "response": response}
             )
             safe = safe and response["safe"]
-        return {"safe": safe, "alerts": alerts}
     @weave.op()
     def predict(self, prompt: str, **kwargs) -> dict:

     @weave.op()
     def guard(self, prompt: str, progress_bar: bool = True, **kwargs) -> dict:
+        alerts, summaries, safe = [], "", True
         iterable = (
             track(self.guardrails, description="Running guardrails")
             if progress_bar
                 {"guardrail_name": guardrail.__class__.__name__, "response": response}
             )
             safe = safe and response["safe"]
+            summaries += (
+                f"**{guardrail.__class__.__name__}**: {response['summary']}\n\n---\n\n"
+            )
+        return {"safe": safe, "alerts": alerts, "summary": summaries}
     @weave.op()
     def predict(self, prompt: str, **kwargs) -> dict: