Spaces:

wandb
/

guardrails-genie

Running

@@ -1,52 +1,16 @@
 import streamlit as st
-import weave
-from dotenv import load_dotenv
-from guardrails_genie.llm import OpenAIModel
-load_dotenv()
-weave.init(project_name="guardrails-genie")
-openai_model = st.sidebar.selectbox("OpenAI LLM", ["", "gpt-4o-mini", "gpt-4o"])
-chat_condition = openai_model != ""
-# Use session state to track if the chat has started
-if "chat_started" not in st.session_state:
-    st.session_state.chat_started = False
-# Start chat when button is pressed
-if st.sidebar.button("Start Chat") and chat_condition:
-    st.session_state.chat_started = True
-# Display chat UI if chat has started
-if st.session_state.chat_started:
-    st.title("Guardrails Genie")
-    # Initialize chat history
-    if "messages" not in st.session_state:
-        st.session_state.messages = []
-    llm_model = OpenAIModel(model_name=openai_model)
-    # Display chat messages from history on app rerun
-    for message in st.session_state.messages:
-        with st.chat_message(message["role"]):
-            st.markdown(message["content"])
-    # React to user input
-    if prompt := st.chat_input("What is up?"):
-        # Display user message in chat message container
-        st.chat_message("user").markdown(prompt)
-        # Add user message to chat history
-        st.session_state.messages.append({"role": "user", "content": prompt})
-        response, call = llm_model.predict.call(
-            llm_model, user_prompts=prompt, messages=st.session_state.messages
-        )
-        response = response.choices[0].message.content
-        # Display assistant response in chat message container
-        with st.chat_message("assistant"):
-            st.markdown(response + f"\n\n---\n[Explore in Weave]({call.ui_url})")
-        # Add assistant response to chat history
-        st.session_state.messages.append({"role": "assistant", "content": response})

 import streamlit as st
+intro_page = st.Page(
+    "application_pages/intro_page.py", title="Introduction", icon=":material/guardian:"
+)
+chat_page = st.Page(
+    "application_pages/chat_app.py", title="Chat", icon=":material/robot:"
+)
+evaluation_page = st.Page(
+    "application_pages/evaluation_app.py",
+    title="Evaluation",
+    icon=":material/monitoring:",
+)
+page_navigation = st.navigation([intro_page, chat_page, evaluation_page])
+st.set_page_config(page_title="Guardrails Genie", page_icon=":material/guardian:")
+page_navigation.run()

application_pages/chat_app.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import importlib
+import streamlit as st
+import weave
+from dotenv import load_dotenv
+from guardrails_genie.guardrails import GuardrailManager
+from guardrails_genie.llm import OpenAIModel
+load_dotenv()
+weave.init(project_name="guardrails-genie")
+st.title(":material/robot: Guardrails Genie")
+if "guardrails" not in st.session_state:
+    st.session_state.guardrails = []
+if "guardrail_names" not in st.session_state:
+    st.session_state.guardrail_names = []
+if "guardrails_manager" not in st.session_state:
+    st.session_state.guardrails_manager = None
+if "chat_started" not in st.session_state:
+    st.session_state.chat_started = False
+def initialize_guardrails():
+    st.session_state.guardrails = []
+    for guardrail_name in st.session_state.guardrail_names:
+        if guardrail_name == "PromptInjectionSurveyGuardrail":
+            survey_guardrail_model = st.sidebar.selectbox(
+                "Survey Guardrail LLM", ["", "gpt-4o-mini", "gpt-4o"]
+            )
+            if survey_guardrail_model:
+                st.session_state.guardrails.append(
+                    getattr(
+                        importlib.import_module("guardrails_genie.guardrails"),
+                        guardrail_name,
+                    )(llm_model=OpenAIModel(model_name=survey_guardrail_model))
+                )
+        else:
+            st.session_state.guardrails.append(
+                getattr(
+                    importlib.import_module("guardrails_genie.guardrails"),
+                    guardrail_name,
+                )()
+            )
+    st.session_state.guardrails_manager = GuardrailManager(
+        guardrails=st.session_state.guardrails
+    )
+openai_model = st.sidebar.selectbox(
+    "OpenAI LLM for Chat", ["", "gpt-4o-mini", "gpt-4o"]
+)
+chat_condition = openai_model != ""
+guardrails = []
+guardrail_names = st.sidebar.multiselect(
+    label="Select Guardrails",
+    options=[
+        cls_name
+        for cls_name, cls_obj in vars(
+            importlib.import_module("guardrails_genie.guardrails")
+        ).items()
+        if isinstance(cls_obj, type) and cls_name != "GuardrailManager"
+    ],
+)
+st.session_state.guardrail_names = guardrail_names
+if st.sidebar.button("Start Chat") and chat_condition:
+    st.session_state.chat_started = True
+if st.session_state.chat_started:
+    with st.sidebar.status("Initializing Guardrails..."):
+        initialize_guardrails()
+    st.title("Guardrails Genie")
+    # Initialize chat history
+    if "messages" not in st.session_state:
+        st.session_state.messages = []
+    llm_model = OpenAIModel(model_name=openai_model)
+    # Display chat messages from history on app rerun
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+    # React to user input
+    if prompt := st.chat_input("What is up?"):
+        # Display user message in chat message container
+        st.chat_message("user").markdown(prompt)
+        # Add user message to chat history
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        guardrails_response, call = st.session_state.guardrails_manager.guard.call(
+            st.session_state.guardrails_manager, prompt=prompt
+        )
+        if guardrails_response["safe"]:
+            response, call = llm_model.predict.call(
+                llm_model, user_prompts=prompt, messages=st.session_state.messages
+            )
+            response = response.choices[0].message.content
+            # Display assistant response in chat message container
+            with st.chat_message("assistant"):
+                st.markdown(response + f"\n\n---\n[Explore in Weave]({call.ui_url})")
+            # Add assistant response to chat history
+            st.session_state.messages.append({"role": "assistant", "content": response})
+        else:
+            st.error("Guardrails detected an issue with the prompt.")
+            for alert in guardrails_response["alerts"]:
+                st.error(f"{alert['guardrail_name']}: {alert['response']}")
+            st.error(f"For details, explore in Weave at {call.ui_url}")

application_pages/evaluation_app.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ import streamlit as st
2	+
3	+ st.title(":material/monitoring: Evaluation")

application_pages/intro_page.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import streamlit as st
+st.title("🧞‍♂️ Guardrails Genie")
+st.write(
+    "Guardrails-Genie is a tool that helps you implement guardrails in your LLM applications."
+)

guardrails_genie/guardrails/__init__.py CHANGED Viewed

@@ -1,3 +1,8 @@
-from .injection import SurveyGuardrail
-__all__ = ["SurveyGuardrail"]

+from .injection import PromptInjectionProtectAIGuardrail, PromptInjectionSurveyGuardrail
+from .manager import GuardrailManager
+__all__ = [
+    "PromptInjectionSurveyGuardrail",
+    "PromptInjectionProtectAIGuardrail",
+    "GuardrailManager",
+]

guardrails_genie/guardrails/base.py CHANGED Viewed

@@ -11,7 +11,3 @@ class Guardrail(weave.Model):
     @weave.op()
     def guard(self, prompt: str, **kwargs) -> list[str]:
         pass
-    @weave.op()
-    def predict(self, prompt: str, **kwargs) -> list[str]:
-        return self.guard(prompt, **kwargs)

     @weave.op()
     def guard(self, prompt: str, **kwargs) -> list[str]:
         pass

guardrails_genie/guardrails/injection/__init__.py CHANGED Viewed

@@ -1,3 +1,4 @@
-from .survey_guardrail import SurveyGuardrail
-__all__ = ["SurveyGuardrail"]

+from .protectai_guardrail import PromptInjectionProtectAIGuardrail
+from .survey_guardrail import PromptInjectionSurveyGuardrail
+__all__ = ["PromptInjectionSurveyGuardrail", "PromptInjectionProtectAIGuardrail"]

guardrails_genie/guardrails/injection/protectai_guardrail.py ADDED Viewed

	@@ -0,0 +1,34 @@

+from typing import Optional
+import torch
+import weave
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
+from transformers.pipelines.base import Pipeline
+from ..base import Guardrail
+class PromptInjectionProtectAIGuardrail(Guardrail):
+    model_name: str = "ProtectAI/deberta-v3-base-prompt-injection-v2"
+    _classifier: Optional[Pipeline] = None
+    def model_post_init(self, __context):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
+        self._classifier = pipeline(
+            "text-classification",
+            model=model,
+            tokenizer=tokenizer,
+            truncation=True,
+            max_length=512,
+            device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
+        )
+    @weave.op()
+    def predict(self, prompt: str):
+        return self._classifier(prompt)
+    @weave.op()
+    def guard(self, prompt: str):
+        response = self.predict(prompt)
+        return {"safe": response[0]["label"] != "INJECTION"}

guardrails_genie/guardrails/injection/survey_guardrail.py CHANGED Viewed

@@ -15,9 +15,9 @@ class SurveyGuardrailResponse(BaseModel):
     explanation: Optional[str]
-class SurveyGuardrail(Guardrail):
     llm_model: OpenAIModel
     @weave.op()
     def load_prompt_injection_survey(self) -> str:
         prompt_injection_survey_path = os.path.join(
@@ -61,7 +61,7 @@ Here are some strict instructions that you must follow:
         return user_prompt, system_prompt
     @weave.op()
-    def guard(self, prompt: str, **kwargs) -> list[str]:
         user_prompt, system_prompt = self.format_prompts(prompt)
         chat_completion = self.llm_model.predict(
             user_prompts=user_prompt,
@@ -70,3 +70,8 @@ Here are some strict instructions that you must follow:
             **kwargs,
         )
         return chat_completion.choices[0].message.parsed

     explanation: Optional[str]
+class PromptInjectionSurveyGuardrail(Guardrail):
     llm_model: OpenAIModel
     @weave.op()
     def load_prompt_injection_survey(self) -> str:
         prompt_injection_survey_path = os.path.join(
         return user_prompt, system_prompt
     @weave.op()
+    def predict(self, prompt: str, **kwargs) -> list[str]:
         user_prompt, system_prompt = self.format_prompts(prompt)
         chat_completion = self.llm_model.predict(
             user_prompts=user_prompt,
             **kwargs,
         )
         return chat_completion.choices[0].message.parsed
+    @weave.op()
+    def guard(self, prompt: str, **kwargs) -> list[str]:
+        response = self.predict(prompt, **kwargs)
+        return {"safe": not response.injection_prompt}

guardrails_genie/guardrails/manager.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import weave
+from rich.progress import track
+from weave.flow.obj import Object as WeaveObject
+from .base import Guardrail
+class GuardrailManager(WeaveObject):
+    guardrails: list[Guardrail]
+    @weave.op()
+    def guard(self, prompt: str, **kwargs) -> dict:
+        alerts, safe = [], True
+        for guardrail in track(self.guardrails, description="Running guardrails"):
+            response = guardrail.guard(prompt, **kwargs)
+            alerts.append(
+                {"guardrail_name": guardrail.__class__.__name__, "response": response}
+            )
+            safe = safe and response["safe"]
+        return {"safe": safe, "alerts": alerts}

pyproject.toml CHANGED Viewed

@@ -12,7 +12,7 @@ dependencies = [
     "ruff>=0.6.9",
     "pip>=24.2",
     "uv>=0.4.20",
-    "weave>=0.51.19",
     "streamlit>=1.40.1",
     "python-dotenv>=1.0.1",
     "watchdog>=6.0.0",

     "ruff>=0.6.9",
     "pip>=24.2",
     "uv>=0.4.20",
+    "weave>=0.51.22",
     "streamlit>=1.40.1",
     "python-dotenv>=1.0.1",
     "watchdog>=6.0.0",